Compare commits

..

15 Commits

Author SHA1 Message Date
Anton Blanchard 2083bc3ed0 ASIC: Fix multiplier power 12 months ago
Anton Blanchard ace41e5153 ASIC: Reduce multiplier from 4 to 2 cycles 1 year ago
Anton Blanchard 907c833521 Move register stage back after the RAM 1 year ago
Anton Blanchard 5c40143754 Add a script to post process the Microwatt verilog for caravel 1 year ago
Anton Blanchard d0ced7441f Add simplebus verilog 1 year ago
Anton Blanchard 606359cce3 Add simplebus 1 year ago
Anton Blanchard 49b332e17f Hook up JTAG to ASIC top level 1 year ago
Anton Blanchard d5e3be80fe Set a unique PVR for caravel MPW5 1 year ago
Anton Blanchard d504ae63f6 Update JTAG TAP controller for Microwatt 1 year ago
Anton Blanchard 15bb4aa90a First pass at an external JTAG port 1 year ago
Anton Blanchard c2577b5446 Add ASIC target 1 year ago
Anton Blanchard 5249d633cf Move register stage from after RAM to before RAM 1 year ago
Anton Blanchard 8ecb30da05 Add arrays for ASIC flow 1 year ago
Anton Blanchard 747c96b100 Cut down hello_world to fit in 4kB 1 year ago
Anton Blanchard faab169307 Allow ALT_RESET_ADDRESS to be overridden 1 year ago
  1. 34
      Makefile
  2. 72
      README.md
  3. 24
      asic/behavioural/Microwatt_FP_DFFRFile.v
  4. 40
      asic/behavioural/RAM32_1RW1R.v
  5. 42
      asic/behavioural/RAM512.v
  6. 22
      asic/behavioural/multiply_add_64x64.v
  7. 99
      asic/cache_ram.vhdl
  8. 63
      asic/main_bram.vhdl
  9. 83
      asic/microwatt_asic-verilator.cpp
  10. 127
      asic/multiply.vhdl
  11. 103
      asic/register_file.vhdl
  12. 285
      asic/top-asic.vhdl
  13. 72
      caravel/insert_power.py
  14. 37
      caravel/process-microwatt-verilog.sh
  15. 2
      common.vhdl
  16. 6
      control.vhdl
  17. 23
      core.vhdl
  18. 1
      core_debug.vhdl
  19. 1
      dcache.vhdl
  20. 2
      divider.vhdl
  21. 302
      dmi_dtm_jtag.vhdl
  22. 7
      execute1.vhdl
  23. 6
      fpu.vhdl
  24. 4
      gpio.vhdl
  25. 57
      hello_world/head.S
  26. BIN
      hello_world/hello_world.bin
  27. BIN
      hello_world/hello_world.elf
  28. 688
      hello_world/hello_world.hex
  29. 28
      hello_world/powerpc.lds
  30. 9
      icache.vhdl
  31. 3
      icache_tb.vhdl
  32. 636
      jtag_tap/tap_top.v
  33. 19
      loadstore1.vhdl
  34. 1319
      simplebus_host.v
  35. 83
      soc.vhdl
  36. 12
      spi_flash_ctrl.vhdl
  37. 9
      spi_rxtx.vhdl
  38. 196
      verilator/jtag-verilator.c
  39. 17
      verilator/microwatt-verilator.cpp

34
Makefile

@ -55,15 +55,18 @@ all = core_tb icache_tb dcache_tb dmi_dtm_tb \ @@ -55,15 +55,18 @@ all = core_tb icache_tb dcache_tb dmi_dtm_tb \

all: $(all)

core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \
utils.vhdl plru.vhdl cache_ram.vhdl icache.vhdl \
base_core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \
utils.vhdl plru.vhdl icache.vhdl \
decode1.vhdl helpers.vhdl insn_helpers.vhdl \
control.vhdl decode2.vhdl register_file.vhdl \
control.vhdl decode2.vhdl \
cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \
logical.vhdl countbits.vhdl multiply.vhdl divider.vhdl execute1.vhdl \
logical.vhdl countbits.vhdl divider.vhdl execute1.vhdl \
loadstore1.vhdl mmu.vhdl dcache.vhdl writeback.vhdl core_debug.vhdl \
core.vhdl fpu.vhdl pmu.vhdl

core_files = $(base_core_files) register_file.vhdl cache_ram.vhdl multiply.vhdl
asic_core_files = $(base_core_files) asic/register_file.vhdl asic/cache_ram.vhdl asic/multiply.vhdl

soc_files = wishbone_arbiter.vhdl wishbone_bram_wrapper.vhdl sync_fifo.vhdl \
wishbone_debug_master.vhdl xics.vhdl syscon.vhdl gpio.vhdl soc.vhdl \
spi_rxtx.vhdl spi_flash_ctrl.vhdl
@ -160,7 +163,7 @@ ICACHE_NUM_LINES=4 @@ -160,7 +163,7 @@ ICACHE_NUM_LINES=4

clkgen=fpga/clk_gen_ecp5.vhd
toplevel=fpga/top-generic.vhdl
dmi_dtm=dmi_dtm_dummy.vhdl
dmi_dtm=dmi_dtm_jtag.vhdl dmi_dtm_dummy.vhdl
LITEDRAM_GHDL_ARG=

# OrangeCrab with ECP85 (original v0.0 with UM5G-85 chip)
@ -218,7 +221,6 @@ GHDL_IMAGE_GENERICS=-gMEMORY_SIZE=$(MEMORY_SIZE) -gRAM_INIT_FILE=$(RAM_INIT_FILE @@ -218,7 +221,6 @@ GHDL_IMAGE_GENERICS=-gMEMORY_SIZE=$(MEMORY_SIZE) -gRAM_INIT_FILE=$(RAM_INIT_FILE
-gRESET_LOW=$(RESET_LOW) -gCLK_INPUT=$(CLK_INPUT) -gCLK_FREQUENCY=$(CLK_FREQUENCY) -gICACHE_NUM_LINES=$(ICACHE_NUM_LINES) \
$(LITEDRAM_GHDL_ARG)


ifeq ($(FPGA_TARGET), verilator)
RESET_LOW=true
CLK_INPUT=50000000
@ -226,22 +228,41 @@ CLK_FREQUENCY=50000000 @@ -226,22 +228,41 @@ CLK_FREQUENCY=50000000
clkgen=fpga/clk_gen_bypass.vhd
endif

ifeq ($(FPGA_TARGET), caravel)
MEMORY_SIZE=4096
RESET_LOW=true
CLK_INPUT=100000000
CLK_FREQUENCY=100000000
endif

fpga_files = fpga/soc_reset.vhdl \
fpga/pp_fifo.vhd fpga/pp_soc_uart.vhd fpga/main_bram.vhdl \
nonrandom.vhdl

asic_files = fpga/pp_fifo.vhd fpga/pp_soc_uart.vhd asic/main_bram.vhdl \
asic/top-asic.vhdl $(dmi_dtm) nonrandom.vhdl

synth_files = $(core_files) $(soc_files) $(soc_extra_synth) $(fpga_files) $(clkgen) $(toplevel) $(dmi_dtm)

asic_synth_files = $(asic_core_files) $(soc_files) $(asic_files)

microwatt.json: $(synth_files) $(RAM_INIT_FILE)
$(YOSYS) $(GHDLSYNTH) -p "ghdl --std=08 --no-formal $(GHDL_IMAGE_GENERICS) $(synth_files) -e toplevel; read_verilog $(uart_files) $(soc_extra_v); synth_ecp5 -abc9 -nowidelut -json $@ $(SYNTH_ECP5_FLAGS)"

microwatt.v: $(synth_files) $(RAM_INIT_FILE)
$(YOSYS) $(GHDLSYNTH) -p "ghdl --std=08 --no-formal $(GHDL_IMAGE_GENERICS) $(synth_files) -e toplevel; write_verilog $@"

microwatt_asic.v: $(asic_synth_files)
$(YOSYS) $(GHDLSYNTH) -p "ghdl --std=08 --no-formal $(GHDL_IMAGE_GENERICS) $(asic_synth_files) -e toplevel; write_verilog $@"

microwatt-verilator: microwatt.v verilator/microwatt-verilator.cpp verilator/uart-verilator.c
$(VERILATOR) $(VERILATOR_FLAGS) -CFLAGS "$(VERILATOR_CFLAGS) -DCLK_FREQUENCY=$(CLK_FREQUENCY)" -Iuart16550 --assert --cc --exe --build $^ -o $@ -top-module toplevel
@cp -f obj_dir/microwatt-verilator microwatt-verilator

microwatt_asic-verilator: microwatt_asic.v asic/microwatt_asic-verilator.cpp verilator/uart-verilator.c verilator/jtag-verilator.c
$(VERILATOR) $(VERILATOR_FLAGS) -CFLAGS "$(VERILATOR_CFLAGS) -DCLK_FREQUENCY=$(CLK_FREQUENCY)" -Iuart16550 -Iasic/behavioural -Ijtag_tap --assert --cc --exe --build $^ -o $@ -top-module toplevel
@cp -f obj_dir/microwatt_asic-verilator microwatt_asic-verilator

microwatt_out.config: microwatt.json $(LPF)
$(NEXTPNR) --json $< --lpf $(LPF) --textcfg $@.tmp $(NEXTPNR_FLAGS) --package $(PACKAGE)
mv -f $@.tmp $@
@ -324,6 +345,7 @@ _clean: @@ -324,6 +345,7 @@ _clean:
rm -f scripts/mw_debug/mw_debug
rm -f microwatt.bin microwatt.json microwatt.svf microwatt_out.config
rm -f microwatt.v microwatt-verilator
rm -f microwatt_asic.v microwatt_asic-verilator
rm -rf obj_dir/

clean: _clean

72
README.md

@ -103,8 +103,14 @@ sudo dnf install fusesoc @@ -103,8 +103,14 @@ sudo dnf install fusesoc

```
fusesoc init
fusesoc fetch uart16550
fusesoc library add microwatt /path/to/microwatt
```

- Create a working directory and point FuseSoC at microwatt:

```
mkdir microwatt-fusesoc
cd microwatt-fusesoc
fusesoc library add microwatt /path/to/microwatt/
```

- Build using FuseSoC. For hello world (Replace nexys_video with your FPGA board such as --target=arty_a7-100):
@ -122,68 +128,6 @@ You should then be able to see output via the serial port of the board (/dev/tty @@ -122,68 +128,6 @@ You should then be able to see output via the serial port of the board (/dev/tty
fusesoc run --target=nexys_video microwatt
```

## Linux on Microwatt

Mainline Linux supports Microwatt as of v5.14. The Arty A7 is the best tested
platform, but it's also been tested on the OrangeCrab and ButterStick.

1. Use buildroot to create a userspace

A small change is required to glibc in order to support the VMX/AltiVec-less
Microwatt, as float128 support is mandiatory and for this in GCC requires
VSX/AltiVec. This change is included in Joel's buildroot fork, along with a
defconfig:
```
git clone -b microwatt https://github.com/shenki/buildroot
cd buildroot
make ppc64le_microwatt_defconfig
make
```

The output is `output/images/rootfs.cpio`.

2. Build the Linux kernel
```
git clone https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
cd linux
make ARCH=powerpc microwatt_defconfig
make ARCH=powerpc CROSS_COMPILE=powerpc64le-linux-gnu- \
CONFIG_INITRAMFS_SOURCE=/buildroot/output/images/rootfs.cpio -j`nproc`
```

The output is `arch/powerpc/boot/dtbImage.microwatt.elf`.

3. Build gateware using FuseSoC

First configure FuseSoC as above.
```
fusesoc run --build --target=arty_a7-100 microwatt --no_bram --memory_size=0
```

The output is `build/microwatt_0/arty_a7-100-vivado/microwatt_0.bit`.

4. Program the flash

This operation will overwrite the contents of your flash.

For the Arty A7 A100, set `FLASH_ADDRESS` to `0x400000` and pass `-f a100`.

For the Arty A7 A35, set `FLASH_ADDRESS` to `0x300000` and pass `-f a35`.
```
microwatt/openocd/flash-arty -f a100 build/microwatt_0/arty_a7-100-vivado/microwatt_0.bit
microwatt/openocd/flash-arty -f a100 dtbImage.microwatt.elf -t bin -a $FLASH_ADDRESS
```

5. Connect to the second USB TTY device exposed by the FPGA

```
minicom -D /dev/ttyUSB1
```

The gateware has firmware that will look at `FLASH_ADDRESS` and attempt to
parse an ELF there, loading it to the address specified in the ELF header
and jumping to it.

## Testing

- A simple test suite containing random execution test cases and a couple of

24
asic/behavioural/Microwatt_FP_DFFRFile.v

@ -0,0 +1,24 @@ @@ -0,0 +1,24 @@
module Microwatt_FP_DFFRFile (
`ifdef USE_POWER_PINS
inout VPWR,
inout VGND,
`endif
input [6:0] R1, R2, R3, RW,
input [63:0] DW,
output [63:0] D1, D2, D3,
input CLK,
input WE
);

reg [63:0] registers[0:95];

assign D1 = registers[R1];
assign D2 = registers[R2];
assign D3 = registers[R3];

always @(posedge CLK) begin
if (WE)
registers[RW] <= DW;
end

endmodule

40
asic/behavioural/RAM32_1RW1R.v

@ -0,0 +1,40 @@ @@ -0,0 +1,40 @@
module RAM32_1RW1R #(
parameter BITS=5
) (
`ifdef USE_POWER_PINS
inout VPWR,
inout VGND,
`endif
input CLK,

input EN0,
input [BITS-1:0] A0,
input [7:0] WE0,
input [63:0] Di0,
output reg [63:0] Do0,

input EN1,
input [BITS-1:0] A1,
output reg [63:0] Do1
);

reg [63:0] RAM[2**BITS-1:0];

always @(posedge CLK) begin
if (EN1)
Do1 <= RAM[A1];
end

generate
genvar i;
for (i=0; i<8; i=i+1) begin: BYTE
always @(posedge CLK) begin
if (EN0) begin
if (WE0[i])
RAM[A0][i*8+7:i*8] <= Di0[i*8+7:i*8];
end
end
end
endgenerate

endmodule

42
asic/behavioural/RAM512.v

@ -0,0 +1,42 @@ @@ -0,0 +1,42 @@
module RAM512 #(
parameter BITS=9,
parameter FILENAME="firmware.hex"
) (
`ifdef USE_POWER_PINS
inout VPWR,
inout VGND,
`endif
input CLK,
input [7:0] WE0,
input EN0,
input [63:0] Di0,
output reg [63:0] Do0,
input [BITS-1:0] A0
);

reg [63:0] RAM[2**BITS-1:0];

always @(posedge CLK) begin
if (EN0)
Do0 <= RAM[A0];
else
Do0 <= 64'b0;
end

generate
genvar i;
for (i=0; i<8; i=i+1) begin: BYTE
always @(posedge CLK) begin
if (EN0) begin
if (WE0[i])
RAM[A0][i*8+7:i*8] <= Di0[i*8+7:i*8];
end
end
end
endgenerate

initial begin
$readmemh(FILENAME, RAM);
end

endmodule

22
asic/behavioural/multiply_add_64x64.v

@ -0,0 +1,22 @@ @@ -0,0 +1,22 @@
module multiply_add_64x64
#(
parameter BITS=64
) (
`ifdef USE_POWER_PINS
inout VPWR,
inout VGND,
`endif
input clk,
input [BITS-1:0] a,
input [BITS-1:0] b,
input [BITS*2-1:0] c,
output [BITS*2-1:0] o
);
reg [BITS*2-1:0] o_tmp;

always @(posedge clk) begin
o_tmp = (a * b) + c;
end

assign o = o_tmp;
endmodule

99
asic/cache_ram.vhdl

@ -0,0 +1,99 @@ @@ -0,0 +1,99 @@
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
use ieee.math_real.all;

entity cache_ram is
generic(
ROW_BITS : integer := 5;
WIDTH : integer := 64;
TRACE : boolean := false;
ADD_BUF : boolean := false
);

port(
clk : in std_logic;

rd_en : in std_logic;
rd_addr : in std_logic_vector(ROW_BITS - 1 downto 0);
rd_data : out std_logic_vector(WIDTH - 1 downto 0);

wr_sel : in std_logic_vector(WIDTH/8 - 1 downto 0);
wr_addr : in std_logic_vector(ROW_BITS - 1 downto 0);
wr_data : in std_logic_vector(WIDTH - 1 downto 0)
);

end cache_ram;

architecture rtl of cache_ram is
component RAM32_1RW1R port(
CLK : in std_logic;

EN0 : in std_logic;
A0 : in std_logic_vector(4 downto 0);
WE0 : in std_logic_vector(7 downto 0);
Di0 : in std_logic_vector(63 downto 0);
Do0 : out std_logic_vector(63 downto 0);

EN1 : in std_logic;
A1 : in std_logic_vector(4 downto 0);
Do1 : out std_logic_vector(63 downto 0)
);
end component;

signal wr_enable: std_logic;
signal rd_data0_tmp : std_logic_vector(WIDTH - 1 downto 0);
signal rd_data0_saved : std_logic_vector(WIDTH - 1 downto 0);
signal rd_data0 : std_logic_vector(WIDTH - 1 downto 0);
signal rd_en_prev: std_ulogic;
begin
assert (ROW_BITS = 5) report "ROW_BITS must be 5" severity FAILURE;
assert (WIDTH = 64) report "Must be 64 bit" severity FAILURE;
assert (TRACE = false) report "Trace not supported" severity FAILURE;

wr_enable <= or(wr_sel);

cache_ram_0 : RAM32_1RW1R
port map (
CLK => clk,

EN0 => wr_enable,
A0 => wr_addr,
WE0 => wr_sel,
Di0 => wr_data,
Do0 => open,

EN1 => rd_en,
A1 => rd_addr,
Do1 => rd_data0_tmp
);

-- The caches rely on cache_ram latching the last read. Handle it here
-- for now.
process(clk)
begin
if rising_edge(clk) then
rd_en_prev <= rd_en;
if rd_en_prev = '1' then
rd_data0_saved <= rd_data0_tmp;
end if;
end if;
end process;
rd_data0 <= rd_data0_tmp when rd_en_prev = '1' else rd_data0_saved;

buf: if ADD_BUF generate
begin
process(clk)
begin
if rising_edge(clk) then
rd_data <= rd_data0;
end if;
end process;
end generate;

nobuf: if not ADD_BUF generate
begin
rd_data <= rd_data0;
end generate;

end architecture rtl;

63
asic/main_bram.vhdl

@ -0,0 +1,63 @@ @@ -0,0 +1,63 @@
library ieee;
use ieee.std_logic_1164.all;

library work;

entity main_bram is
generic(
WIDTH : natural := 64;
HEIGHT_BITS : natural;
MEMORY_SIZE : natural;
RAM_INIT_FILE : string
);
port(
clk : in std_logic;
addr : in std_logic_vector(HEIGHT_BITS - 1 downto 0) ;
din : in std_logic_vector(WIDTH-1 downto 0);
dout : out std_logic_vector(WIDTH-1 downto 0);
sel : in std_logic_vector((WIDTH/8)-1 downto 0);
re : in std_ulogic;
we : in std_ulogic
);
end entity main_bram;

architecture behaviour of main_bram is
component RAM512 port (
CLK : in std_ulogic;
WE0 : in std_ulogic_vector(7 downto 0);
EN0 : in std_ulogic;
Di0 : in std_ulogic_vector(63 downto 0);
Do0 : out std_ulogic_vector(63 downto 0);
A0 : in std_ulogic_vector(8 downto 0)
);
end component;

signal sel_qual: std_ulogic_vector((WIDTH/8)-1 downto 0);

signal obuf : std_logic_vector(WIDTH-1 downto 0);
begin
assert (WIDTH = 64) report "Must be 64 bit" severity FAILURE;
-- Do we have a log2 round up issue here?
assert (HEIGHT_BITS = 9) report "HEIGHT_BITS must be 10" severity FAILURE;
assert (MEMORY_SIZE = 4096) report "MEMORY_SIZE must be 4096" severity FAILURE;

sel_qual <= sel when we = '1' else (others => '0');

memory_0 : RAM512
port map (
CLK => clk,
WE0 => sel_qual(7 downto 0),
EN0 => re or we,
Di0 => din(63 downto 0),
Do0 => obuf(63 downto 0),
A0 => addr(8 downto 0)
);

-- The wishbone BRAM wrapper assumes a 1 cycle delay
memory_read_buffer: process(clk)
begin
if rising_edge(clk) then
dout <= obuf;
end if;
end process;
end architecture behaviour;

83
asic/microwatt_asic-verilator.cpp

@ -0,0 +1,83 @@ @@ -0,0 +1,83 @@
#include <stdlib.h>
#include "Vtoplevel.h"
#include "verilated.h"
#include "verilated_vcd_c.h"

/*
* Current simulation time
* This is a 64-bit integer to reduce wrap over issues and
* allow modulus. You can also use a double, if you wish.
*/
vluint64_t main_time = 0;

/*
* Called by $time in Verilog
* converts to double, to match
* what SystemC does
*/
double sc_time_stamp(void)
{
return main_time;
}

#if VM_TRACE
VerilatedVcdC *tfp;
#endif

void tick(Vtoplevel *top)
{
top->ext_clk = 1;
top->eval();
#if VM_TRACE
if (tfp)
tfp->dump((double) main_time);
#endif
main_time++;

top->ext_clk = 0;
top->eval();
#if VM_TRACE
if (tfp)
tfp->dump((double) main_time);
#endif
main_time++;
}

void uart_tx(unsigned char tx);
unsigned char uart_rx(void);

int main(int argc, char **argv)
{
Verilated::commandArgs(argc, argv);

// init top verilog instance
Vtoplevel* top = new Vtoplevel;

#if VM_TRACE
// init trace dump
Verilated::traceEverOn(true);
tfp = new VerilatedVcdC;
top->trace(tfp, 99);
tfp->open("microwatt-verilator.vcd");
#endif

// Reset
top->ext_rst = 0;
for (unsigned long i = 0; i < 5; i++)
tick(top);
top->ext_rst = 1;

while(!Verilated::gotFinish()) {
tick(top);

uart_tx(top->uart0_txd);
top->uart0_rxd = uart_rx();
}

#if VM_TRACE
tfp->close();
delete tfp;
#endif

delete top;
}

127
asic/multiply.vhdl

@ -0,0 +1,127 @@ @@ -0,0 +1,127 @@
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

library work;
use work.common.all;

entity multiply is
generic (
PIPELINE_DEPTH : natural := 2
);
port (
clk : in std_logic;

m_in : in MultiplyInputType;
m_out : out MultiplyOutputType
);
end entity multiply;

architecture behaviour of multiply is
signal m: MultiplyInputType := MultiplyInputInit;

type multiply_pipeline_stage is record
valid : std_ulogic;
is_32bit : std_ulogic;
not_res : std_ulogic;
end record;
constant MultiplyPipelineStageInit : multiply_pipeline_stage := (valid => '0',
is_32bit => '0',
not_res => '0');

type multiply_pipeline_type is array(0 to PIPELINE_DEPTH-1) of multiply_pipeline_stage;
constant MultiplyPipelineInit : multiply_pipeline_type := (others => MultiplyPipelineStageInit);

type reg_type is record
multiply_pipeline : multiply_pipeline_type;
end record;

signal r, rin : reg_type := (multiply_pipeline => MultiplyPipelineInit);
signal overflow : std_ulogic;
signal ovf_in : std_ulogic;

signal mult_out : std_logic_vector(127 downto 0);

component multiply_add_64x64 port(
clk : in std_logic;
a : in std_logic_vector(63 downto 0);
b : in std_logic_vector(63 downto 0);
c : in std_logic_vector(127 downto 0);
o : out std_logic_vector(127 downto 0)
);
end component;
begin
multiply_0: process(clk)
begin
if rising_edge(clk) then
m <= m_in;
r <= rin;
overflow <= ovf_in;
end if;
end process;

multiplier : multiply_add_64x64
port map (
clk => clk,
a => m.data1,
b => m.data2,
c => m.addend,
o => mult_out
);

multiply_1: process(all)
variable v : reg_type;
variable d : std_ulogic_vector(127 downto 0);
variable d2 : std_ulogic_vector(63 downto 0);
variable ov : std_ulogic;
begin
v := r;
v.multiply_pipeline(0).valid := m.valid;
v.multiply_pipeline(0).is_32bit := m.is_32bit;
v.multiply_pipeline(0).not_res := m.not_result;

loop_0: for i in 1 to PIPELINE_DEPTH-1 loop
v.multiply_pipeline(i) := r.multiply_pipeline(i-1);
end loop;

if v.multiply_pipeline(PIPELINE_DEPTH-1).not_res = '1' then
d := not mult_out;
else
d := mult_out;
end if;

ov := '0';
if v.multiply_pipeline(PIPELINE_DEPTH-1).is_32bit = '1' then
ov := (or d(63 downto 31)) and not (and d(63 downto 31));
else
ov := (or d(127 downto 63)) and not (and d(127 downto 63));
end if;
ovf_in <= ov;

m_out.result <= d;
m_out.overflow <= overflow;
m_out.valid <= v.multiply_pipeline(PIPELINE_DEPTH-1).valid;

rin <= v;
end process;
end architecture behaviour;


library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

entity short_multiply is
port (
clk : in std_ulogic;

a_in : in std_ulogic_vector(15 downto 0);
b_in : in std_ulogic_vector(15 downto 0);
m_out : out std_ulogic_vector(31 downto 0)
);
end entity short_multiply;

architecture behaviour of short_multiply is
begin
m_out <= std_ulogic_vector(signed(a_in) * signed(b_in));
end architecture behaviour;

103
asic/register_file.vhdl

@ -0,0 +1,103 @@ @@ -0,0 +1,103 @@
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

library work;
use work.common.all;

entity register_file is
generic (
SIM : boolean := false;
HAS_FPU : boolean := true;
LOG_LENGTH : natural := 0
);
port(
clk : in std_logic;

d_in : in Decode2ToRegisterFileType;
d_out : out RegisterFileToDecode2Type;

w_in : in WritebackToRegisterFileType;

dbg_gpr_req : in std_ulogic;
dbg_gpr_ack : out std_ulogic;
dbg_gpr_addr : in gspr_index_t;
dbg_gpr_data : out std_ulogic_vector(63 downto 0);

sim_dump : in std_ulogic;
sim_dump_done : out std_ulogic;

log_out : out std_ulogic_vector(71 downto 0)
);
end entity register_file;

architecture behaviour of register_file is
component Microwatt_FP_DFFRFile port (
CLK : in std_ulogic;

R1 : in std_ulogic_vector(6 downto 0);
R2 : in std_ulogic_vector(6 downto 0);
R3 : in std_ulogic_vector(6 downto 0);

D1 : out std_ulogic_vector(63 downto 0);
D2 : out std_ulogic_vector(63 downto 0);
D3 : out std_ulogic_vector(63 downto 0);

WE : in std_ulogic;
RW : in std_ulogic_vector(6 downto 0);
DW : in std_ulogic_vector(63 downto 0)
);
end component;

signal d1: std_ulogic_vector(63 downto 0);
signal d2: std_ulogic_vector(63 downto 0);
signal d3: std_ulogic_vector(63 downto 0);
begin

register_file_0 : Microwatt_FP_DFFRFile
port map (
CLK => clk,

R1 => d_in.read1_reg,
R2 => d_in.read2_reg,
R3 => d_in.read3_reg,

D1 => d1,
D2 => d2,
D3 => d3,

WE => w_in.write_enable,
RW => w_in.write_reg,
DW => w_in.write_data
);

x_state_check: process(clk)
begin
if rising_edge(clk) then
if w_in.write_enable = '1' then
assert not(is_x(w_in.write_data)) and not(is_x(w_in.write_reg)) severity failure;
end if;
end if;
end process x_state_check;

-- Forward any written data
register_read_0: process(all)
begin
d_out.read1_data <= d1;
d_out.read2_data <= d2;
d_out.read3_data <= d3;

if w_in.write_enable = '1' then
if d_in.read1_reg = w_in.write_reg then
d_out.read1_data <= w_in.write_data;
end if;
if d_in.read2_reg = w_in.write_reg then
d_out.read2_data <= w_in.write_data;
end if;
if d_in.read3_reg = w_in.write_reg then
d_out.read3_data <= w_in.write_data;
end if;
end if;
end process register_read_0;

end architecture behaviour;

285
asic/top-asic.vhdl

@ -0,0 +1,285 @@ @@ -0,0 +1,285 @@
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

library work;
use work.wishbone_types.all;

entity toplevel is
generic (
MEMORY_SIZE : integer := 8192;
RAM_INIT_FILE : string := "firmware.hex";
RESET_LOW : boolean := true;
CLK_INPUT : positive := 100000000;
CLK_FREQUENCY : positive := 100000000;
HAS_FPU : boolean := true;
HAS_BTC : boolean := false;
NO_BRAM : boolean := false;
DISABLE_FLATTEN_CORE : boolean := false;
ALT_RESET_ADDRESS : std_logic_vector(63 downto 0) := (27 downto 0 => '0', others => '1');
SPI_FLASH_OFFSET : integer := 0;
SPI_FLASH_DEF_CKDV : natural := 4;
SPI_FLASH_DEF_QUAD : boolean := false;
SPI_BOOT_CLOCKS : boolean := false;
LOG_LENGTH : natural := 0;
UART_IS_16550 : boolean := true;
HAS_UART1 : boolean := false;
HAS_JTAG : boolean := true;
ICACHE_NUM_LINES : natural := 4;
ICACHE_NUM_WAYS : natural := 1;
ICACHE_TLB_SIZE : natural := 4;
DCACHE_NUM_LINES : natural := 4;
DCACHE_NUM_WAYS : natural := 1;
DCACHE_TLB_SET_SIZE : natural := 2;
DCACHE_TLB_NUM_WAYS : natural := 2;
HAS_GPIO : boolean := true;
NGPIO : natural := 32
);
port(
ext_clk : in std_ulogic;
ext_rst : in std_ulogic;

-- UART0 signals:
uart0_txd : out std_ulogic;
uart0_rxd : in std_ulogic;

-- SPI
spi_flash_cs_n : out std_ulogic;
spi_flash_clk : out std_ulogic;
spi_flash_sdat_i : in std_ulogic_vector(3 downto 0);
spi_flash_sdat_o : out std_ulogic_vector(3 downto 0);
spi_flash_sdat_oe : out std_ulogic_vector(3 downto 0);

-- GPIO
gpio_in : in std_ulogic_vector(NGPIO - 1 downto 0);
gpio_out : out std_ulogic_vector(NGPIO - 1 downto 0);
gpio_dir : out std_ulogic_vector(NGPIO - 1 downto 0);

-- JTAG signals:
jtag_tck : in std_ulogic;
jtag_tdi : in std_ulogic;
jtag_tms : in std_ulogic;
jtag_trst : in std_ulogic;
jtag_tdo : out std_ulogic;

-- simplebus
simplebus_clk : out std_logic;
simplebus_bus_out : out std_logic_vector(7 downto 0);
simplebus_parity_out : out std_logic;
simplebus_bus_in : in std_logic_vector(7 downto 0);
simplebus_parity_in : in std_logic;
simplebus_enabled : out std_logic;
simplebus_irq : in std_ulogic;

-- Add an I/O pin to select fetching from flash on reset
alt_reset : in std_ulogic
);
end entity toplevel;

architecture behaviour of toplevel is
-- reset signals
signal system_rst : std_ulogic;

-- simplebus wishbone connection
signal wb_simplebus_out : wishbone_master_out;
signal wb_simplebus_in : wishbone_slave_out;

-- simplebus split out wishbone
signal wb_simplebus_adr : wishbone_addr_type;
signal wb_simplebus_dat_o : wishbone_data_type;
signal wb_simplebus_cyc : std_ulogic;
signal wb_simplebus_stb : std_ulogic;
signal wb_simplebus_sel : wishbone_sel_type;
signal wb_simplebus_we : std_ulogic;
signal wb_simplebus_dat_i : wishbone_data_type;
signal wb_simplebus_ack : std_ulogic;
signal wb_simplebus_stall : std_ulogic;

-- simplebus I/O wishbone
signal wb_ext_io_in : wb_io_master_out;
signal wb_ext_io_out : wb_io_slave_out;
signal wb_ext_is_simplebus : std_ulogic;

-- simplebus I/O split out wishbone
signal wb_simplebus_ctrl_adr : std_ulogic_vector(29 downto 0);
signal wb_simplebus_ctrl_dat_o : std_ulogic_vector(31 downto 0);
signal wb_simplebus_ctrl_cyc : std_ulogic;
signal wb_simplebus_ctrl_stb : std_ulogic;
signal wb_simplebus_ctrl_sel : std_ulogic_vector(3 downto 0);
signal wb_simplebus_ctrl_we : std_ulogic;
signal wb_simplebus_ctrl_dat_i : std_ulogic_vector(31 downto 0);
signal wb_simplebus_ctrl_ack : std_ulogic;
signal wb_simplebus_ctrl_stall : std_ulogic;

component simplebus_host port(
clk : in std_logic;
rst : in std_logic;

wb_cyc : in std_logic;
wb_stb : in std_logic;
wb_we : in std_logic;
wb_adr : in wishbone_addr_type;
wb_dat_w : in wishbone_data_type;
wb_sel : in std_logic_vector;
wb_ack : out std_logic;
wb_stall : out std_logic;
wb_dat_r : out wishbone_data_type;

wb_ctrl_cyc : in std_logic;
wb_ctrl_stb : in std_logic;
wb_ctrl_we : in std_logic;
wb_ctrl_adr : in std_logic_vector(29 downto 0);
wb_ctrl_dat_w : in std_logic_vector(31 downto 0);
wb_ctrl_sel : in std_logic_vector(3 downto 0);
wb_ctrl_ack : out std_logic;
wb_ctrl_stall : out std_logic;
wb_ctrl_dat_r : out std_logic_vector(31 downto 0);

clk_out : out std_logic;
bus_out : out std_logic_vector(7 downto 0);
parity_out : out std_logic;
bus_in : in std_logic_vector(7 downto 0);
parity_in : in std_logic;
enabled : out std_logic
);
end component simplebus_host;

begin

system_rst <= not ext_rst when RESET_LOW else ext_rst;

-- Main SoC
soc0: entity work.soc
generic map(
MEMORY_SIZE => MEMORY_SIZE,
RAM_INIT_FILE => RAM_INIT_FILE,
SIM => false,
CLK_FREQ => CLK_FREQUENCY,
HAS_FPU => HAS_FPU,
HAS_BTC => HAS_BTC,
HAS_DRAM => true,
DRAM_SIZE => 0,
DRAM_INIT_SIZE => 0,
DISABLE_FLATTEN_CORE => DISABLE_FLATTEN_CORE,
ALT_RESET_ADDRESS => ALT_RESET_ADDRESS,
HAS_SPI_FLASH => true,
SPI_FLASH_DLINES => 4,
SPI_FLASH_OFFSET => SPI_FLASH_OFFSET,
SPI_FLASH_DEF_CKDV => SPI_FLASH_DEF_CKDV,
SPI_FLASH_DEF_QUAD => SPI_FLASH_DEF_QUAD,
SPI_BOOT_CLOCKS => SPI_BOOT_CLOCKS,
LOG_LENGTH => LOG_LENGTH,
UART0_IS_16550 => UART_IS_16550,
HAS_UART1 => HAS_UART1,
HAS_GPIO => HAS_GPIO,
NGPIO => NGPIO,
HAS_JTAG => HAS_JTAG,
ICACHE_NUM_LINES => ICACHE_NUM_LINES,
ICACHE_NUM_WAYS => ICACHE_NUM_WAYS,
ICACHE_TLB_SIZE => ICACHE_TLB_SIZE,
DCACHE_NUM_LINES => DCACHE_NUM_LINES,
DCACHE_NUM_WAYS => DCACHE_NUM_WAYS,
DCACHE_TLB_SET_SIZE => DCACHE_TLB_SET_SIZE,
DCACHE_TLB_NUM_WAYS => DCACHE_TLB_NUM_WAYS
)
port map (
-- System signals
system_clk => ext_clk,
rst => system_rst,

-- UART signals
uart0_txd => uart0_txd,
uart0_rxd => uart0_rxd,

-- SPI signals
spi_flash_sck => spi_flash_clk,
spi_flash_cs_n => spi_flash_cs_n,
spi_flash_sdat_o => spi_flash_sdat_o,
spi_flash_sdat_oe => spi_flash_sdat_oe,
spi_flash_sdat_i => spi_flash_sdat_i,

-- GPIO signals
gpio_in => gpio_in,
gpio_out => gpio_out,
gpio_dir => gpio_dir,

-- JTAG signals
jtag_tck => jtag_tck,
jtag_tdi => jtag_tdi,
jtag_tms => jtag_tms,
jtag_trst => jtag_trst,
jtag_tdo => jtag_tdo,

-- simplebus 64-bit wishbone
wb_dram_in => wb_simplebus_out,
wb_dram_out => wb_simplebus_in,

-- simplebus 32-bit external IO wishbone
wb_ext_io_in => wb_ext_io_in,
wb_ext_io_out => wb_ext_io_out,
wb_ext_is_dram_csr => wb_ext_is_simplebus,

ext_irq_eth => simplebus_irq,

-- Reset PC to flash offset 0 (ie 0xf000000)
alt_reset => alt_reset
);

-- simplebus wishbone
wb_simplebus_adr <= wb_simplebus_out.adr;
wb_simplebus_dat_o <= wb_simplebus_out.dat;
wb_simplebus_cyc <= wb_simplebus_out.cyc;
wb_simplebus_stb <= wb_simplebus_out.stb;
wb_simplebus_sel <= wb_simplebus_out.sel;
wb_simplebus_we <= wb_simplebus_out.we;

wb_simplebus_in.dat <= wb_simplebus_dat_i;
wb_simplebus_in.ack <= wb_simplebus_ack;
wb_simplebus_in.stall <= wb_simplebus_stall;

-- simplebus I/O wishbone
wb_simplebus_ctrl_adr <= wb_ext_io_in.adr;
wb_simplebus_ctrl_dat_o <= wb_ext_io_in.dat;
wb_simplebus_ctrl_cyc <= wb_ext_io_in.cyc and wb_ext_is_simplebus;
wb_simplebus_ctrl_stb <= wb_ext_io_in.stb and wb_ext_is_simplebus;
wb_simplebus_ctrl_sel <= wb_ext_io_in.sel;
wb_simplebus_ctrl_we <= wb_ext_io_in.we;

wb_ext_io_out.dat <= wb_simplebus_ctrl_dat_i;
wb_ext_io_out.ack <= wb_simplebus_ctrl_ack;
wb_ext_io_out.stall <= wb_simplebus_ctrl_stall;

simplebus_0: simplebus_host
port map(
clk => ext_clk,
rst => system_rst,

wb_cyc => wb_simplebus_cyc,
wb_stb => wb_simplebus_stb,
wb_we => wb_simplebus_we,
wb_adr => wb_simplebus_adr,
wb_dat_w => wb_simplebus_dat_o,
wb_sel => wb_simplebus_sel,
wb_ack => wb_simplebus_ack,
wb_stall => wb_simplebus_stall,
wb_dat_r => wb_simplebus_dat_i,

wb_ctrl_cyc => wb_simplebus_ctrl_cyc,
wb_ctrl_stb => wb_simplebus_ctrl_stb,
wb_ctrl_we => wb_simplebus_ctrl_we,
wb_ctrl_adr => wb_simplebus_ctrl_adr,
wb_ctrl_dat_w => wb_simplebus_ctrl_dat_o,
wb_ctrl_sel => wb_simplebus_ctrl_sel,
wb_ctrl_ack => wb_simplebus_ctrl_ack,
wb_ctrl_stall => wb_simplebus_ctrl_stall,
wb_ctrl_dat_r => wb_simplebus_ctrl_dat_i,

clk_out => simplebus_clk,
bus_out => simplebus_bus_out,
parity_out => simplebus_parity_out,
bus_in => simplebus_bus_in,
parity_in => simplebus_parity_in,
enabled => simplebus_enabled
);

end architecture behaviour;

72
caravel/insert_power.py

@ -0,0 +1,72 @@ @@ -0,0 +1,72 @@
#!/usr/bin/python

import argparse
import re

module_regex = r'[a-zA-Z0-9_:\.\\]+'

# match:
# module dcache(clk, rst, d_in, m_in, wishbone_in, d_out, m_out, stall_out, wishbone_out);
# A bit of a hack - ignore anything contining a '`', and assume that means we've already
# processed this module in a previous run. This helps when having to run this script
# multiple times for different power names.
multiline_module_re = re.compile(r'module\s+(' + module_regex + r')\(([^`]*?)\);', re.DOTALL)
module_re = re.compile(r'module\s+(' + module_regex + r')\((.*?)\);')

# match:
# dcache_64_2_2_2_2_12_0 dcache_0 (
hookup_re = re.compile(r'\s+(' + module_regex + r') ' + module_regex + r'\s+\(')

header1 = """\
`ifdef USE_POWER_PINS
{power}, {ground},
`endif\
"""

header2 = """\
`ifdef USE_POWER_PINS
inout {power};
inout {ground};
`endif\
"""

header3 = """\
`ifdef USE_POWER_PINS
.{power}({parent_power}),
.{ground}({parent_ground}),
`endif\
"""

parser = argparse.ArgumentParser(description='Insert power and ground into verilog modules')
parser.add_argument('--power', default='VPWR', help='POWER net name (default VPWR)')
parser.add_argument('--ground', default='VGND', help='POWER net name (default VGND)')
parser.add_argument('--parent-power', default='VPWR', help='POWER net name of parent module (default VPWR)')
parser.add_argument('--parent-ground', default='VGND', help='POWER net name of parent module (default VGND)')
parser.add_argument('--verilog', required=True, help='Verilog file to modify')
parser.add_argument('--module', required=True, action='append', help='Module to replace (can be specified multiple times')

args = parser.parse_args()

with open(args.verilog, 'r') as f:
d = f.read()
# Remove newlines from module definitions, yosys started doing this as of
# commit ff8e999a7112 ("Split module ports, 20 per line")
fixed = multiline_module_re.sub(lambda m: m.group(0).replace("\n", ""), d)

for line in fixed.splitlines():
m = module_re.match(line)
m2 = hookup_re.match(line)
if m and m.group(1) in args.module:
module_name = m.group(1)
module_args = m.group(2)
print('module %s(' % (module_name))
print("")
print(header1.format(power=args.power, ground=args.ground))
print(' %s);' % module_args)
print(header2.format(power=args.power, ground=args.ground))
elif m2 and m2.group(1) in args.module:
print(line)
print(header3.format(parent_power=args.parent_power, parent_ground=args.parent_ground, power=args.power, ground=args.ground))
else:
print(line)

37
caravel/process-microwatt-verilog.sh

@ -0,0 +1,37 @@ @@ -0,0 +1,37 @@
#!/bin/bash -e

# process microwatt verilog

FILE_IN=microwatt_asic.v
FILE_OUT=microwatt_asic_processed.v

# Rename top level
sed 's/toplevel/microwatt/' < $FILE_IN > $FILE_OUT

# Add power to all macros, and route power in microwatt down to them
caravel/insert_power.py --verilog=$FILE_OUT --parent-power=vccd1 --parent-ground=vssd1 --power=vccd1 --ground=vssd1 --module=microwatt --module=core_0_4_1_4_4_1_2_2_452bf2882a9b5f1c06340d5059c72dbd8af3bf8b --module=execute1_0_47ec8d98366433dc002e7721c9e37d5067547937 --module=multiply_2 --module=soc_4096_100000000_0_0_4_0_4_0_4_1_4_4_1_2_2_32_529beb193518cdd5546a21170d32ebafc9f9cb89 --module=icache_64_8_4_1_4_12_0_5ba93c9db0cff93f52b521d7420e43f6eda2784f --module=dcache_64_4_1_2_2_12_0 --module=cache_ram_5_64_1489f923c4dca729178b3e3233458550d8dddf29 --module=main_bram_64_9_4096_a75adb9e07879fb6c63b494abe06e3f9a6bb2ed9 --module=register_file_0_3f29546453678b855931c174a97d6c0894b8f546 --module=wishbone_bram_wrapper_4096_a75adb9e07879fb6c63b494abe06e3f9a6bb2ed9 --module=fpu > ${FILE_OUT}.tmp1

# Hard macros use VPWR/VGND
caravel/insert_power.py --verilog=${FILE_OUT}.tmp1 --parent-power=vccd1 --parent-ground=vssd1 --power=VPWR --ground=VGND --module=Microwatt_FP_DFFRFile --module=multiply_add_64x64 --module=RAM32_1RW1R --module=RAM512 > ${FILE_OUT}.tmp2

mv ${FILE_OUT}.tmp2 ${FILE_OUT}
rm ${FILE_OUT}.tmp1

# Add defines
sed -i '1 a\
\
/* JTAG */\
`include "tap_top.v"\
\
/* UART */\
`include "raminfr.v"\
`include "uart_receiver.v"\
`include "uart_rfifo.v"\
`include "uart_tfifo.v"\
`include "uart_transmitter.v"\
`include "uart_defines.v"\
`include "uart_regs.v"\
`include "uart_sync_flops.v"\
`include "uart_wb.v"\
`include "uart_top.v"\
`include "simplebus_host.v"' $FILE_OUT

2
common.vhdl

@ -8,7 +8,7 @@ use work.decode_types.all; @@ -8,7 +8,7 @@ use work.decode_types.all;

package common is
-- Processor Version Number
constant PVR_MICROWATT : std_ulogic_vector(31 downto 0) := x"00630000";
constant PVR_MICROWATT : std_ulogic_vector(31 downto 0) := x"00630101";

-- MSR bit numbers
constant MSR_SF : integer := (63 - 0); -- Sixty-Four bit mode

6
control.vhdl

@ -64,8 +64,8 @@ architecture rtl of control is @@ -64,8 +64,8 @@ architecture rtl of control is

signal r_int, rin_int : reg_internal_type := reg_internal_init;

signal gpr_write_valid : std_ulogic;
signal cr_write_valid : std_ulogic;
signal gpr_write_valid : std_ulogic := '0';
signal cr_write_valid : std_ulogic := '0';

type tag_register is record
wr_gpr : std_ulogic;
@ -245,8 +245,6 @@ begin @@ -245,8 +245,6 @@ begin
end if;

if rst = '1' then
gpr_write_valid <= '0';
cr_write_valid <= '0';
v_int := reg_internal_init;
valid_tmp := '0';
end if;

23
core.vhdl

@ -117,20 +117,21 @@ architecture behave of core is @@ -117,20 +117,21 @@ architecture behave of core is
signal complete: instr_tag_t;
signal terminate: std_ulogic;
signal core_rst: std_ulogic;
signal icache_inv: std_ulogic;
signal do_interrupt: std_ulogic;

-- Delayed/Latched resets and alt_reset
signal rst_fetch1 : std_ulogic;
signal rst_fetch2 : std_ulogic;
signal rst_icache : std_ulogic;
signal rst_dcache : std_ulogic;
signal rst_dec1 : std_ulogic;
signal rst_dec2 : std_ulogic;
signal rst_ex1 : std_ulogic;
signal rst_fpu : std_ulogic;
signal rst_ls1 : std_ulogic;
signal rst_wback : std_ulogic;
signal rst_dbg : std_ulogic;
signal rst_fetch1 : std_ulogic := '1';
signal rst_fetch2 : std_ulogic := '1';
signal rst_icache : std_ulogic := '1';
signal rst_dcache : std_ulogic := '1';
signal rst_dec1 : std_ulogic := '1';
signal rst_dec2 : std_ulogic := '1';
signal rst_ex1 : std_ulogic := '1';
signal rst_fpu : std_ulogic := '1';
signal rst_ls1 : std_ulogic := '1';
signal rst_wback : std_ulogic := '1';
signal rst_dbg : std_ulogic := '1';
signal alt_reset_d : std_ulogic;

signal sim_cr_dump: std_ulogic;

1
core_debug.vhdl

@ -154,7 +154,6 @@ begin @@ -154,7 +154,6 @@ begin
stopping <= '0';
terminated <= '0';
log_trigger_delay <= 0;
gspr_index <= (others => '0');
else
if do_log_trigger = '1' or log_trigger_delay /= 0 then
if log_trigger_delay = 255 then

1
dcache.vhdl

@ -1121,6 +1121,7 @@ begin @@ -1121,6 +1121,7 @@ begin
rams: for i in 0 to NUM_WAYS-1 generate
signal do_read : std_ulogic;
signal rd_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
signal do_write : std_ulogic;
signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
signal wr_data : std_ulogic_vector(wishbone_data_bits-1 downto 0);
signal wr_sel : std_ulogic_vector(ROW_SIZE-1 downto 0);

2
divider.vhdl

@ -42,8 +42,6 @@ begin @@ -42,8 +42,6 @@ begin
quot <= (others => '0');
running <= '0';
count <= "0000000";
is_32bit <= '0';
overflow <= '0';
elsif d_in.valid = '1' then
if d_in.is_extended = '1' then
dend <= '0' & d_in.dividend & x"0000000000000000";

302
dmi_dtm_jtag.vhdl

@ -0,0 +1,302 @@ @@ -0,0 +1,302 @@
-- JTAG to DMI interface, based on the Xilinx version
--
-- DMI bus
--
-- req : ____/------------\_____
-- addr: xxxx< >xxxxx, based on the Xilinx version
-- dout: xxxx< >xxxxx
-- wr : xxxx< >xxxxx
-- din : xxxxxxxxxxxx< >xxx
-- ack : ____________/------\___
--
-- * addr/dout set along with req, can be latched on same cycle by slave
-- * ack & din remain up until req is dropped by master, the slave must
-- provide a stable output on din on reads during that time.
-- * req remains low at until at least one sysclk after ack seen down.
--
-- JTAG (tck) DMI (sys_clk)
--
-- * jtag_req = 1
-- (jtag_req_0) *
-- (jtag_req_1) -> * dmi_req = 1 >
-- *.../...
-- * dmi_ack = 1 <
-- * (dmi_ack_0)
-- * <- (dmi_ack_1)
-- * jtag_req = 0 (and latch dmi_din)
-- (jtag_req_0) *
-- (jtag_req_1) -> * dmi_req = 0 >
-- * dmi_ack = 0 <
-- * (dmi_ack_0)
-- * <- (dmi_ack_1)
--
-- jtag_req can go back to 1 when jtag_rsp_1 is 0
--
-- Questions/TODO:
-- - I use 2 flip fops for sync, is that enough ?
-- - I treat the jtag_trst as an async reset, is that necessary ?
-- - Dbl check reset situation since we have two different resets
-- each only resetting part of the logic...
-- - Look at optionally removing the synchronizer on the ack path,
-- assuming JTAG is always slow enough that ack will have been
-- stable long enough by the time CAPTURE comes in.
-- - We could avoid the latched request by not shifting while a
-- request is in progress (and force TDO to 1 to return a busy
-- status).
--
-- WARNING: This isn't the real DMI JTAG protocol (at least not yet).
-- a command while busy will be ignored. A response of "11"
-- means the previous command is still going, try again.
-- As such We don't implement the DMI "error" status, and
-- we don't implement DTMCS yet... This may still all change
-- but for now it's easier that way as the real DMI protocol
-- requires for a command to work properly that enough TCK
-- are sent while IDLE and I'm having trouble getting that
-- working with UrJtag and the Xilinx BSCAN2 for now.

library ieee;
use ieee.std_logic_1164.all;
use ieee.math_real.all;

library work;
use work.wishbone_types.all;

entity dmi_dtm_jtag is
generic(ABITS : INTEGER:=8;
DBITS : INTEGER:=32);

port(sys_clk : in std_ulogic;
sys_reset : in std_ulogic;
dmi_addr : out std_ulogic_vector(ABITS - 1 downto 0);
dmi_din : in std_ulogic_vector(DBITS - 1 downto 0);
dmi_dout : out std_ulogic_vector(DBITS - 1 downto 0);
dmi_req : out std_ulogic;
dmi_wr : out std_ulogic;
dmi_ack : in std_ulogic;
-- dmi_err : in std_ulogic TODO: Add error response
jtag_tck : in std_ulogic;
jtag_tdi : in std_ulogic;
jtag_tms : in std_ulogic;
jtag_trst : in std_ulogic;
jtag_tdo : out std_ulogic
);
end entity dmi_dtm_jtag;

architecture behaviour of dmi_dtm_jtag is

-- Signals coming out of the JTAG TAP controller
signal capture : std_ulogic;
signal update : std_ulogic;
signal sel : std_ulogic;
signal shift : std_ulogic;
signal tdi : std_ulogic;
signal tdo : std_ulogic;

-- ** JTAG clock domain **

-- Shift register
signal shiftr : std_ulogic_vector(ABITS + DBITS + 1 downto 0);

-- Latched request
signal request : std_ulogic_vector(ABITS + DBITS + 1 downto 0);

-- A request is present
signal jtag_req : std_ulogic;

-- Synchronizer for jtag_rsp (sys clk -> jtag_tck)
signal dmi_ack_0 : std_ulogic;
signal dmi_ack_1 : std_ulogic;

-- ** sys clock domain **

-- Synchronizer for jtag_req (jtag clk -> sys clk)
signal jtag_req_0 : std_ulogic;
signal jtag_req_1 : std_ulogic;

-- ** combination signals
signal jtag_bsy : std_ulogic;
signal op_valid : std_ulogic;
signal rsp_op : std_ulogic_vector(1 downto 0);

-- ** Constants **
constant DMI_REQ_NOP : std_ulogic_vector(1 downto 0) := "00";
constant DMI_REQ_RD : std_ulogic_vector(1 downto 0) := "01";
constant DMI_REQ_WR : std_ulogic_vector(1 downto 0) := "10";
constant DMI_RSP_OK : std_ulogic_vector(1 downto 0) := "00";
constant DMI_RSP_BSY : std_ulogic_vector(1 downto 0) := "11";

attribute ASYNC_REG : string;
attribute ASYNC_REG of jtag_req_0: signal is "TRUE";
attribute ASYNC_REG of jtag_req_1: signal is "TRUE";
attribute ASYNC_REG of dmi_ack_0: signal is "TRUE";
attribute ASYNC_REG of dmi_ack_1: signal is "TRUE";

component tap_top port (
-- JTAG pads
tms_pad_i : in std_ulogic;
tck_pad_i : in std_ulogic;
trst_pad_i : in std_ulogic;
tdi_pad_i : in std_ulogic;
tdo_pad_o : out std_ulogic;
tdo_padoe_o : out std_ulogic;

-- TAP states
shift_dr_o : out std_ulogic;
pause_dr_o : out std_ulogic;
update_dr_o : out std_ulogic;
capture_dr_o : out std_ulogic;

-- Select signals for boundary scan or mbist
extest_select_o : out std_ulogic;
sample_preload_select_o : out std_ulogic;
mbist_select_o : out std_ulogic;
debug_select_o : out std_ulogic;

-- TDO signal that is connected to TDI of sub-modules.
tdo_o : out std_ulogic;

-- TDI signals from sub-modules
debug_tdi_i : in std_ulogic;
bs_chain_tdi_i : in std_ulogic;
mbist_tdi_i : in std_ulogic
);
end component;

begin
tap_top0 : tap_top
port map (
tms_pad_i => jtag_tms,
tck_pad_i => jtag_tck,
trst_pad_i => jtag_trst,
tdi_pad_i => jtag_tdi,
tdo_pad_o => jtag_tdo,
tdo_padoe_o => open, -- what to do with this?

shift_dr_o => shift,
pause_dr_o => open, -- what to do with this?
update_dr_o => update,
capture_dr_o => capture,

-- connect boundary scan and mbist?
extest_select_o => open,
sample_preload_select_o => open,
mbist_select_o => open,
debug_select_o => sel,

tdo_o => tdi,
debug_tdi_i => tdo,
bs_chain_tdi_i => '0',
mbist_tdi_i => '0'
);

-- dmi_req synchronization
dmi_req_sync : process(sys_clk)
begin
-- sys_reset is synchronous
if rising_edge(sys_clk) then
if (sys_reset = '1') then
jtag_req_0 <= '0';
jtag_req_1 <= '0';
else
jtag_req_0 <= jtag_req;
jtag_req_1 <= jtag_req_0;
end if;
end if;
end process;
dmi_req <= jtag_req_1;

-- dmi_ack synchronization
dmi_ack_sync: process(jtag_tck, jtag_trst)
begin
-- jtag_trst is async (see comments)
if jtag_trst = '1' then
dmi_ack_0 <= '0';
dmi_ack_1 <= '0';
elsif rising_edge(jtag_tck) then
dmi_ack_0 <= dmi_ack;
dmi_ack_1 <= dmi_ack_0;
end if;
end process;

-- jtag_bsy indicates whether we can start a new request, we can when
-- we aren't already processing one (jtag_req) and the synchronized ack
-- of the previous one is 0.
--
jtag_bsy <= jtag_req or dmi_ack_1;

-- decode request type in shift register
with shiftr(1 downto 0) select op_valid <=
'1' when DMI_REQ_RD,
'1' when DMI_REQ_WR,
'0' when others;

-- encode response op
rsp_op <= DMI_RSP_BSY when jtag_bsy = '1' else DMI_RSP_OK;

-- Some DMI out signals are directly driven from the request register
dmi_addr <= request(ABITS + DBITS + 1 downto DBITS + 2);
dmi_dout <= request(DBITS + 1 downto 2);
dmi_wr <= '1' when request(1 downto 0) = DMI_REQ_WR else '0';

-- TDO is wired to shift register bit 0
tdo <= shiftr(0);

-- Main state machine. Handles shift registers, request latch and
-- jtag_req latch. Could be split into 3 processes but it's probably
-- not worthwhile.
--
shifter: process(jtag_tck, jtag_trst, sys_reset)
begin
if jtag_trst = '1' or sys_reset = '1' then
shiftr <= (others => '0');
jtag_req <= '0';
request <= (others => '0');
elsif rising_edge(jtag_tck) then

-- Handle jtag "commands" when sel is 1
if sel = '1' then
-- Shift state, rotate the register
if shift = '1' then
shiftr <= tdi & shiftr(ABITS + DBITS + 1 downto 1);
end if;

-- Update state (trigger)
--
-- Latch the request if we aren't already processing one and
-- it has a valid command opcode.
--
if update = '1' and op_valid = '1' then
if jtag_bsy = '0' then
request <= shiftr;
jtag_req <= '1';
end if;
-- Set the shift register "op" to "busy". This will prevent
-- us from re-starting the command on the next update if
-- the command completes before that.
shiftr(1 downto 0) <= DMI_RSP_BSY;
end if;

-- Request completion.
--
-- Capture the response data for reads and clear request flag.
--
-- Note: We clear req (and thus dmi_req) here which relies on tck
-- ticking and sel set. This means we are stuck with dmi_req up if
-- the jtag interface stops. Slaves must be resilient to this.
--
if jtag_req = '1' and dmi_ack_1 = '1' then
jtag_req <= '0';
if request(1 downto 0) = DMI_REQ_RD then
request(DBITS + 1 downto 2) <= dmi_din;
end if;
end if;

-- Capture state, grab latch content with updated status
if capture = '1' then
shiftr <= request(ABITS + DBITS + 1 downto 2) & rsp_op;
end if;

end if;
end if;
end process;
end architecture behaviour;

7
execute1.vhdl

@ -99,8 +99,8 @@ architecture behaviour of execute1 is @@ -99,8 +99,8 @@ architecture behaviour of execute1 is
signal mshort_p : std_ulogic_vector(31 downto 0) := (others => '0');

signal valid_in : std_ulogic;
signal ctrl: ctrl_t;
signal ctrl_tmp: ctrl_t;
signal ctrl: ctrl_t := (others => (others => '0'));
signal ctrl_tmp: ctrl_t := (others => (others => '0'));
signal right_shift, rot_clear_left, rot_clear_right: std_ulogic;
signal rot_sign_ext: std_ulogic;
signal rotator_result: std_ulogic_vector(63 downto 0);
@ -113,6 +113,8 @@ architecture behaviour of execute1 is @@ -113,6 +113,8 @@ architecture behaviour of execute1 is
signal misc_result: std_ulogic_vector(63 downto 0);
signal muldiv_result: std_ulogic_vector(63 downto 0);
signal spr_result: std_ulogic_vector(63 downto 0);
signal result_mux_sel: std_ulogic_vector(2 downto 0);
<