diff --git a/asic/behavioural/Microwatt_FP_DFFRFile.v b/asic/behavioural/Microwatt_FP_DFFRFile.v
new file mode 100644
index 0000000..b880c6c
--- /dev/null
+++ b/asic/behavioural/Microwatt_FP_DFFRFile.v
@@ -0,0 +1,28 @@
+// Behavioural model of the register file macro: 96 entries of 64 bits with
+// three combinational read ports (R1/D1, R2/D2, R3/D3) and one write port
+// (RW/DW) that is clocked on the rising edge of CLK when WE is high.
+// The 7-bit indices can address 0..127, but only 0..95 exist in the array.
+module Microwatt_FP_DFFRFile (
+`ifdef USE_POWER_PINS
+    inout VPWR,
+    inout VGND,
+`endif
+    input [6:0] R1, R2, R3, RW,
+    input [63:0] DW,
+    output [63:0] D1, D2, D3,
+    input CLK,
+    input WE
+);
+
+    reg [63:0] registers[0:95];
+
+    assign D1 = registers[R1];
+    assign D2 = registers[R2];
+    assign D3 = registers[R3];
+
+    always @(posedge CLK) begin
+        if (WE)
+            registers[RW] <= DW;
+    end
+
+endmodule
diff --git a/asic/behavioural/RAM32_1RW1R.v b/asic/behavioural/RAM32_1RW1R.v
new file mode 100644
index 0000000..9ecb90b
--- /dev/null
+++ b/asic/behavioural/RAM32_1RW1R.v
@@ -0,0 +1,45 @@
+// Behavioural model of a 2**BITS x 64-bit RAM with two ports.
+//   Port 0: write only in this model - when EN0 is high, each set bit of
+//           WE0 writes the matching byte of Di0 to RAM[A0].
+//   Port 1: registered read - Do1 is updated from RAM[A1] when EN1 is high.
+// NOTE(review): Do0 is declared but never assigned anywhere in this model.
+module RAM32_1RW1R #(
+    parameter BITS=5
+) (
+`ifdef USE_POWER_PINS
+    inout VPWR,
+    inout VGND,
+`endif
+    input CLK,
+
+    input EN0,
+    input [BITS-1:0] A0,
+    input [7:0] WE0,
+    input [63:0] Di0,
+    output reg [63:0] Do0,
+
+    input EN1,
+    input [BITS-1:0] A1,
+    output reg [63:0] Do1
+);
+
+    reg [63:0] RAM[2**BITS-1:0];
+
+    always @(posedge CLK) begin
+        if (EN1)
+            Do1 <= RAM[A1];
+    end
+
+    generate
+        genvar i;
+        for (i=0; i<8; i=i+1) begin: BYTE
+            always @(posedge CLK) begin
+                if (EN0) begin
+                    if (WE0[i])
+                        RAM[A0][i*8+7:i*8] <= Di0[i*8+7:i*8];
+                end
+            end
+        end
+    endgenerate
+
+endmodule
diff --git a/asic/behavioural/RAM512.v b/asic/behavioural/RAM512.v
new file mode 100644
index 0000000..e232db9
--- /dev/null
+++ b/asic/behavioural/RAM512.v
@@ -0,0 +1,46 @@
+// Behavioural model of a single-port 2**BITS x 64-bit RAM whose contents
+// are preloaded from FILENAME with $readmemh.
+// When EN0 is high the addressed word is registered into Do0 and the bytes
+// selected by WE0 are written; when EN0 is low Do0 is cleared to zero.
+module RAM512 #(
+    parameter BITS=9,
+    parameter FILENAME="firmware.hex"
+) (
+`ifdef USE_POWER_PINS
+    inout VPWR,
+    inout VGND,
+`endif
+    input CLK,
+    input [7:0] WE0,
+    input EN0,
+    input [63:0] Di0,
+    output reg [63:0] Do0,
+    input [BITS-1:0] A0
+);
+
+    reg [63:0] RAM[2**BITS-1:0];
+
+    always @(posedge CLK) begin
+        if (EN0)
+            Do0 <= RAM[A0];
+        else
+            Do0 <= 64'b0;
+    end
+
+    generate
+        genvar i;
+        for (i=0; i<8; i=i+1) begin: BYTE
+            always @(posedge CLK) begin
+                if (EN0) begin
+                    if (WE0[i])
+                        RAM[A0][i*8+7:i*8] <= Di0[i*8+7:i*8];
+                end
+            end
+        end
+    endgenerate
+
+initial begin
+    $readmemh(FILENAME, RAM);
+end
+
+endmodule
diff --git a/asic/behavioural/multiply_add_64x64.v b/asic/behavioural/multiply_add_64x64.v
new file mode 100644
index 0000000..303ed24
--- /dev/null
+++ b/asic/behavioural/multiply_add_64x64.v
@@ -0,0 +1,28 @@
+// Behavioural model of a pipelined multiply-add, o = (a * b) + c.
+// The result reaches o three rising clk edges after the operands are sampled.
+module multiply_add_64x64
+#(
+    parameter BITS=64
+) (
+`ifdef USE_POWER_PINS
+    inout VPWR,
+    inout VGND,
+`endif
+    input clk,
+    input [BITS-1:0] a,
+    input [BITS-1:0] b,
+    input [BITS*2-1:0] c,
+    output [BITS*2-1:0] o
+);
+    reg [BITS*2-1:0] o_tmp[2:0];
+
+    always @(posedge clk) begin
+        // Non-blocking assignments: each stage samples the pre-edge value, so
+        // the shift does not depend on statement order and cannot race.
+        o_tmp[0] <= (a * b) + c;
+        o_tmp[1] <= o_tmp[0];
+        o_tmp[2] <= o_tmp[1];
+    end
+
+    assign o = o_tmp[2];
+endmodule
diff --git a/asic/cache_ram.vhdl b/asic/cache_ram.vhdl
new file mode 100644
index 0000000..5b92abd
--- /dev/null
+++ b/asic/cache_ram.vhdl
@@ -0,0 +1,102 @@
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+use ieee.math_real.all;
+
+-- Cache data RAM wrapper around the RAM32_1RW1R macro.
+-- Only ROW_BITS = 5, WIDTH = 64 and TRACE = false are accepted (asserted in
+-- the architecture); ADD_BUF = true adds one extra register on rd_data.
+entity cache_ram is
+    generic(
+        ROW_BITS : integer := 5;
+        WIDTH    : integer := 64;
+        TRACE    : boolean := false;
+        ADD_BUF  : boolean := false
+        );
+
+    port(
+        clk     : in  std_logic;
+
+        rd_en   : in  std_logic;
+        rd_addr : in  std_logic_vector(ROW_BITS - 1 downto 0);
+        rd_data : out std_logic_vector(WIDTH - 1 downto 0);
+
+        wr_sel  : in  std_logic_vector(WIDTH/8 - 1 downto 0);
+        wr_addr : in  std_logic_vector(ROW_BITS - 1 downto 0);
+        wr_data : in  std_logic_vector(WIDTH - 1 downto 0)
+        );
+
+end cache_ram;
+
+architecture rtl of cache_ram is
+    component RAM32_1RW1R port(
+        CLK : in std_logic;
+
+        EN0 : in std_logic;
+        A0  : in std_logic_vector(4 downto 0);
+        WE0 : in std_logic_vector(7 downto 0);
+        Di0 : in std_logic_vector(63 downto 0);
+        Do0 : out std_logic_vector(63 downto 0);
+
+        EN1 : in std_logic;
+        A1  : in std_logic_vector(4 downto 0);
+        Do1 : out std_logic_vector(63 downto 0)
+    );
+    end component;
+
+    signal wr_enable: std_logic;
+    signal rd_data0_tmp : std_logic_vector(WIDTH - 1 downto 0);
+    signal rd_data0_saved : std_logic_vector(WIDTH - 1 downto 0);
+    signal rd_data0 : std_logic_vector(WIDTH - 1 downto 0);
+    signal rd_en_prev: std_ulogic;
+begin
+    assert (ROW_BITS = 5) report "ROW_BITS must be 5" severity FAILURE;
+    assert (WIDTH = 64) report "Must be 64 bit" severity FAILURE;
+    assert (TRACE = false) report "Trace not supported" severity FAILURE;
+
+    wr_enable <= or(wr_sel);
+
+    cache_ram_0 : RAM32_1RW1R
+        port map (
+            CLK => clk,
+
+            EN0 => wr_enable,
+            A0 => wr_addr,
+            WE0 => wr_sel,
+            Di0 => wr_data,
+            Do0 => open,
+
+            EN1 => rd_en,
+            A1 => rd_addr,
+            Do1 => rd_data0_tmp
+            );
+
+    -- The caches rely on cache_ram latching the last read. Handle it here
+    -- for now.
+    process(clk)
+    begin
+        if rising_edge(clk) then
+            rd_en_prev <= rd_en;
+            if rd_en_prev = '1' then
+                rd_data0_saved <= rd_data0_tmp;
+            end if;
+        end if;
+    end process;
+    rd_data0 <= rd_data0_tmp when rd_en_prev = '1' else rd_data0_saved;
+
+    buf: if ADD_BUF generate
+    begin
+        process(clk)
+        begin
+            if rising_edge(clk) then
+                rd_data <= rd_data0;
+            end if;
+        end process;
+    end generate;
+
+    nobuf: if not ADD_BUF generate
+    begin
+        rd_data <= rd_data0;
+    end generate;
+
+end architecture rtl;
diff --git a/asic/main_bram.vhdl b/asic/main_bram.vhdl
new file mode 100644
index 0000000..add4b29
--- /dev/null
+++ b/asic/main_bram.vhdl
@@ -0,0 +1,67 @@
+library ieee;
+use ieee.std_logic_1164.all;
+
+library work;
+
+-- Main memory wrapper around a single RAM512 macro.
+-- dout is registered once more after the macro's own output register.
+entity main_bram is
+    generic(
+        WIDTH        : natural := 64;
+        HEIGHT_BITS  : natural;
+        MEMORY_SIZE  : natural;
+        RAM_INIT_FILE : string
+        );
+    port(
+        clk  : in std_logic;
+        addr : in std_logic_vector(HEIGHT_BITS - 1 downto 0) ;
+        din  : in std_logic_vector(WIDTH-1 downto 0);
+        dout : out std_logic_vector(WIDTH-1 downto 0);
+        sel  : in std_logic_vector((WIDTH/8)-1 downto 0);
+        re   : in std_ulogic;
+        we   : in std_ulogic
+        );
+end entity main_bram;
+
+architecture behaviour of main_bram is
+    component RAM512 port (
+        CLK : in std_ulogic;
+        WE0 : in std_ulogic_vector(7 downto 0);
+        EN0 : in std_ulogic;
+        Di0 : in std_ulogic_vector(63 downto 0);
+        Do0 : out std_ulogic_vector(63 downto 0);
+        A0  : in std_ulogic_vector(8 downto 0)
+    );
+    end component;
+
+    signal sel_qual: std_ulogic_vector((WIDTH/8)-1 downto 0);
+
+    signal obuf : std_logic_vector(WIDTH-1 downto 0);
+begin
+    assert (WIDTH = 64) report "Must be 64 bit" severity FAILURE;
+    -- Do we have a log2 round up issue here?
+    assert (HEIGHT_BITS = 9) report "HEIGHT_BITS must be 9" severity FAILURE;
+    assert (MEMORY_SIZE = 4096) report "MEMORY_SIZE must be 4096" severity FAILURE;
+
+    sel_qual <= sel when we = '1' else (others => '0');
+
+    -- NOTE(review): RAM_INIT_FILE is not forwarded to RAM512, so the macro
+    -- model loads whatever its own FILENAME parameter defaults to.
+    memory_0 : RAM512
+        port map (
+            CLK => clk,
+            WE0 => sel_qual(7 downto 0),
+            EN0 => re or we,
+            Di0 => din(63 downto 0),
+            Do0 => obuf(63 downto 0),
+            A0  => addr(8 downto 0)
+        );
+
+    -- The wishbone BRAM wrapper assumes a 1 cycle delay
+    memory_read_buffer: process(clk)
+    begin
+        if rising_edge(clk) then
+            dout <= obuf;
+        end if;
+    end process;
+end architecture behaviour;
diff --git a/asic/multiply.vhdl b/asic/multiply.vhdl
new file mode 100644
index 0000000..a604554
--- /dev/null
+++ b/asic/multiply.vhdl
@@ -0,0 +1,133 @@
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+library work;
+use work.common.all;
+
+-- 64x64 multiply-add wrapper around the multiply_add_64x64 macro.
+-- The valid/is_32bit/not_res flags travel through a PIPELINE_DEPTH-deep
+-- shift register alongside the macro.
+-- NOTE(review): the behavioural multiply_add_64x64 has three register
+-- stages, which lines up with PIPELINE_DEPTH = 4 only - confirm before
+-- changing the generic.
+-- XXX We should be able to make timing with a 2 cycle multiplier
+entity multiply is
+    generic (
+        PIPELINE_DEPTH : natural := 4
+        );
+    port (
+        clk   : in std_logic;
+
+        m_in  : in MultiplyInputType;
+        m_out : out MultiplyOutputType
+        );
+end entity multiply;
+
+architecture behaviour of multiply is
+    signal m: MultiplyInputType := MultiplyInputInit;
+
+    type multiply_pipeline_stage is record
+        valid     : std_ulogic;
+        is_32bit  : std_ulogic;
+        not_res   : std_ulogic;
+    end record;
+    constant MultiplyPipelineStageInit : multiply_pipeline_stage := (valid => '0',
+                                                                     is_32bit => '0',
+                                                                     not_res => '0');
+
+    type multiply_pipeline_type is array(0 to PIPELINE_DEPTH-1) of multiply_pipeline_stage;
+    constant MultiplyPipelineInit : multiply_pipeline_type := (others => MultiplyPipelineStageInit);
+
+    type reg_type is record
+        multiply_pipeline : multiply_pipeline_type;
+    end record;
+
+    signal r, rin : reg_type := (multiply_pipeline => MultiplyPipelineInit);
+    signal overflow : std_ulogic;
+    signal ovf_in   : std_ulogic;
+
+    signal mult_out : std_logic_vector(127 downto 0);
+
+    component multiply_add_64x64 port(
+        clk : in std_logic;
+        a   : in std_logic_vector(63 downto 0);
+        b   : in std_logic_vector(63 downto 0);
+        c   : in std_logic_vector(127 downto 0);
+        o   : out std_logic_vector(127 downto 0)
+    );
+    end component;
+begin
+    multiply_0: process(clk)
+    begin
+        if rising_edge(clk) then
+            m <= m_in;
+            r <= rin;
+            overflow <= ovf_in;
+        end if;
+    end process;
+
+    multiplier : multiply_add_64x64
+        port map (
+            clk => clk,
+            a => m.data1,
+            b => m.data2,
+            c => m.addend,
+            o => mult_out
+        );
+
+    multiply_1: process(all)
+        variable v : reg_type;
+        variable d : std_ulogic_vector(127 downto 0);
+        variable ov : std_ulogic;
+    begin
+        v := r;
+        v.multiply_pipeline(0).valid := m.valid;
+        v.multiply_pipeline(0).is_32bit := m.is_32bit;
+        v.multiply_pipeline(0).not_res := m.not_result;
+
+        loop_0: for i in 1 to PIPELINE_DEPTH-1 loop
+            v.multiply_pipeline(i) := r.multiply_pipeline(i-1);
+        end loop;
+
+        if v.multiply_pipeline(PIPELINE_DEPTH-1).not_res = '1' then
+            d := not mult_out;
+        else
+            d := mult_out;
+        end if;
+
+        ov := '0';
+        if v.multiply_pipeline(PIPELINE_DEPTH-1).is_32bit = '1' then
+            ov := (or d(63 downto 31)) and not (and d(63 downto 31));
+        else
+            ov := (or d(127 downto 63)) and not (and d(127 downto 63));
+        end if;
+        ovf_in <= ov;
+
+        m_out.result <= d;
+        m_out.overflow <= overflow;
+        m_out.valid <= v.multiply_pipeline(PIPELINE_DEPTH-1).valid;
+
+        rin <= v;
+    end process;
+end architecture behaviour;
+
+
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+entity short_multiply is
+    port (
+        clk   : in std_ulogic;
+        -- clk is not referenced: the product below is purely combinational
+        a_in  : in std_ulogic_vector(15 downto 0);
+        b_in  : in std_ulogic_vector(15 downto 0);
+        m_out : out std_ulogic_vector(31 downto 0)
+        );
+end entity short_multiply;
+
+architecture behaviour of short_multiply is
+begin
+    m_out <= std_ulogic_vector(signed(a_in) * signed(b_in));
+end architecture behaviour;
diff --git a/asic/register_file.vhdl b/asic/register_file.vhdl
new file mode 100644
index 0000000..e2f17f5
--- /dev/null
+++ b/asic/register_file.vhdl
@@ -0,0 +1,117 @@
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+library work;
+use work.common.all;
+
+-- Register file wrapper around the Microwatt_FP_DFFRFile macro.
+-- Reads are combinational; a value being written in the current cycle is
+-- forwarded to any read port that addresses the same register.
+entity register_file is
+    generic (
+        SIM : boolean := false;
+        HAS_FPU : boolean := true;
+        LOG_LENGTH : natural := 0
+        );
+    port(
+        clk           : in std_logic;
+
+        d_in          : in Decode2ToRegisterFileType;
+        d_out         : out RegisterFileToDecode2Type;
+
+        w_in          : in WritebackToRegisterFileType;
+
+        dbg_gpr_req   : in std_ulogic;
+        dbg_gpr_ack   : out std_ulogic;
+        dbg_gpr_addr  : in gspr_index_t;
+        dbg_gpr_data  : out std_ulogic_vector(63 downto 0);
+
+        sim_dump      : in std_ulogic;
+        sim_dump_done : out std_ulogic;
+
+        log_out       : out std_ulogic_vector(71 downto 0)
+        );
+end entity register_file;
+
+architecture behaviour of register_file is
+    component Microwatt_FP_DFFRFile port (
+        CLK : in std_ulogic;
+
+        R1  : in std_ulogic_vector(6 downto 0);
+        R2  : in std_ulogic_vector(6 downto 0);
+        R3  : in std_ulogic_vector(6 downto 0);
+
+        D1  : out std_ulogic_vector(63 downto 0);
+        D2  : out std_ulogic_vector(63 downto 0);
+        D3  : out std_ulogic_vector(63 downto 0);
+
+        WE  : in std_ulogic;
+        RW  : in std_ulogic_vector(6 downto 0);
+        DW  : in std_ulogic_vector(63 downto 0)
+    );
+    end component;
+
+    signal d1: std_ulogic_vector(63 downto 0);
+    signal d2: std_ulogic_vector(63 downto 0);
+    signal d3: std_ulogic_vector(63 downto 0);
+begin
+
+    register_file_0 : Microwatt_FP_DFFRFile
+        port map (
+            CLK => clk,
+
+            R1 => d_in.read1_reg,
+            R2 => d_in.read2_reg,
+            R3 => d_in.read3_reg,
+
+            D1 => d1,
+            D2 => d2,
+            D3 => d3,
+
+            WE => w_in.write_enable,
+            RW => w_in.write_reg,
+            DW => w_in.write_data
+        );
+
+    -- The debug, sim-dump and log ports are not implemented by this
+    -- wrapper. Drive them to constants so they are not left at 'U'.
+    -- NOTE(review): a debug GPR request is never acknowledged - confirm
+    -- that is the intended behaviour for the ASIC build.
+    dbg_gpr_ack   <= '0';
+    dbg_gpr_data  <= (others => '0');
+    sim_dump_done <= '0';
+    log_out       <= (others => '0');
+
+    x_state_check: process(clk)
+    begin
+        if rising_edge(clk) then
+            if w_in.write_enable = '1' then
+                assert not(is_x(w_in.write_data)) and not(is_x(w_in.write_reg))
+                    report "register_file: write of X/U data or register index"
+                    severity failure;
+            end if;
+        end if;
+    end process x_state_check;
+
+    -- Forward any written data
+    register_read_0: process(all)
+    begin
+        d_out.read1_data <= d1;
+        d_out.read2_data <= d2;
+        d_out.read3_data <= d3;
+
+        if w_in.write_enable = '1' then
+            if d_in.read1_reg = w_in.write_reg then
+                d_out.read1_data <= w_in.write_data;
+            end if;
+            if d_in.read2_reg = w_in.write_reg then
+                d_out.read2_data <= w_in.write_data;
+            end if;
+            if d_in.read3_reg = w_in.write_reg then
+                d_out.read3_data <= w_in.write_data;
+            end if;
+        end if;
+    end process register_read_0;
+
+end architecture behaviour;