Merge pull request #353 from tianrui-wei/master

fix: fix icache_tb not finishing correctly
Merge pull request #373 from antonblanchard/icache-insn-u-state
179 changed files with 115385 additions and 64631 deletions
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@ -5,12 +5,19 @@ on:
  pull_request:
  schedule:
    - cron: '0 0 * * 5'
+  workflow_dispatch:

 jobs:

  build:
    runs-on: ubuntu-latest
-    container: ghdl/vunit:llvm
+    strategy:
+      fail-fast: false
+      matrix:
+        backend:
+        - llvm
+        - gcc
+    container: ghdl/vunit:${{ matrix.backend }}
    steps:
    - uses: actions/checkout@v2
    - run: make GNATMAKE='gnatmake -j'$(nproc)
@ -33,7 +40,6 @@ jobs:
      max-parallel: 3
      matrix:
        task: [
-          "tests_unit",
          "tests_console",
          "{1..99}",
          "{100..199}",
@ -52,16 +58,24 @@ jobs:
    - uses: actions/checkout@v2
    - run: bash -c "make -j$(nproc) ${{ matrix.task }}"

+  VUnit:
+    needs: [build]
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+    - uses: docker://ghdl/vunit:llvm
+      with:
+        args: python3 ./run.py -p10
+
  symbiflow:
    strategy:
      fail-fast: false
      max-parallel: 2
      matrix:
-        task: [ ECP5-EVN, ORANGE-CRAB ]
+        task: [ ECP5-EVN, ORANGE-CRAB, ORANGE-CRAB-0.21 ]
    runs-on: ubuntu-latest
    env:
      DOCKER: 1
-      SYNTH_ECP5_FLAGS: -noflatten
      FPGA_TARGET: ${{matrix.task}}
    steps:
    - uses: actions/checkout@v2
@ -79,3 +93,17 @@ jobs:
    steps:
    - uses: actions/checkout@v2
    - run: make DOCKER=1 microwatt.v
+
+  verilator:
+    runs-on: ubuntu-latest
+    env:
+      DOCKER: 1
+      FPGA_TARGET: verilator
+      RAM_INIT_FILE: micropython/firmware.hex
+      MEMORY_SIZE: 524288
+    steps:
+    - uses: actions/checkout@v2
+    - run: |
+        sudo apt update
+        sudo apt install -y python3-pexpect
+        make -j$(nproc) test_micropython_verilator test_micropython_verilator_long
--- a/.gitignore
+++ b/.gitignore
@ -13,4 +13,5 @@ tests/*/*.hex
 tests/*/*.elf
 TAGS
 litedram/build/*
+liteeth/build/*
 obj_dir/*
--- a/152
+++ b/152
@ -1,12 +1,22 @@
 GHDL ?= ghdl
-GHDLFLAGS=--std=08 -frelaxed
+GHDLFLAGS=--std=08
 CFLAGS=-O3 -Wall
+# Need to investigate why yosys is hitting verilator warnings, and eventually turn on -Wall
+VERILATOR_FLAGS=-O3 -Wno-fatal -Wno-CASEOVERLAP -Wno-UNOPTFLAT #--trace
+# It takes forever to build with optimisation, so disable by default
+#VERILATOR_CFLAGS=-O3

-GHDLSYNTH ?= ghdl.so
+# some yosys builds have ghdl plugin built in, otherwise need "-m ghdl"
+GHDLSYNTH ?= $(shell ($(YOSYS) -H | grep -q ghdl) || echo -m ghdl)
 YOSYS     ?= yosys
 NEXTPNR   ?= nextpnr-ecp5
 ECPPACK   ?= ecppack
+ECPPROG   ?= ecpprog
 OPENOCD   ?= openocd
+VUNITRUN  ?= python3 ./run.py
+VERILATOR ?= verilator
+DFUUTIL   ?= dfu-util
+DFUSUFFIX ?= dfu-suffix

 # We need a version of GHDL built with either the LLVM or gcc backend.
 # Fedora provides this, but other distros may not. Another option is to use
@ -29,37 +39,41 @@ PWD = $(shell pwd)
 DOCKERARGS = run --rm -v $(PWD):/src:z -w /src
 GHDL      = $(DOCKERBIN) $(DOCKERARGS) ghdl/ghdl:buster-llvm-7 ghdl
 CC        = $(DOCKERBIN) $(DOCKERARGS) ghdl/ghdl:buster-llvm-7 gcc
-GHDLSYNTH = ghdl
+GHDLSYNTH = -m ghdl
 YOSYS     = $(DOCKERBIN) $(DOCKERARGS) hdlc/ghdl:yosys yosys
 NEXTPNR   = $(DOCKERBIN) $(DOCKERARGS) hdlc/nextpnr:ecp5 nextpnr-ecp5
 ECPPACK   = $(DOCKERBIN) $(DOCKERARGS) hdlc/prjtrellis ecppack
 OPENOCD   = $(DOCKERBIN) $(DOCKERARGS) --device /dev/bus/usb hdlc/prog openocd
+VUNITRUN  = $(DOCKERBIN) $(DOCKERARGS) ghdl/vunit:llvm python3 ./run.py
+VERILATOR = $(DOCKERBIN) $(DOCKERARGS) verilator/verilator:latest
 endif

-all = core_tb icache_tb dcache_tb multiply_tb dmi_dtm_tb divider_tb \
-	rotator_tb countzero_tb wishbone_bram_tb soc_reset_tb
+VUNITARGS += -p10
+
+all = core_tb icache_tb dcache_tb dmi_dtm_tb \
+	wishbone_bram_tb soc_reset_tb

 all: $(all)

 core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \
 	utils.vhdl plru.vhdl cache_ram.vhdl icache.vhdl \
-	decode1.vhdl helpers.vhdl insn_helpers.vhdl gpr_hazard.vhdl \
-	cr_hazard.vhdl control.vhdl decode2.vhdl register_file.vhdl \
+	decode1.vhdl helpers.vhdl insn_helpers.vhdl \
+	control.vhdl decode2.vhdl register_file.vhdl \
 	cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \
-	logical.vhdl countzero.vhdl multiply.vhdl divider.vhdl execute1.vhdl \
+	logical.vhdl countbits.vhdl multiply.vhdl divider.vhdl execute1.vhdl \
 	loadstore1.vhdl mmu.vhdl dcache.vhdl writeback.vhdl core_debug.vhdl \
-	core.vhdl fpu.vhdl
+	core.vhdl fpu.vhdl pmu.vhdl

-soc_files = $(core_files) wishbone_arbiter.vhdl wishbone_bram_wrapper.vhdl sync_fifo.vhdl \
-	wishbone_debug_master.vhdl xics.vhdl syscon.vhdl soc.vhdl \
+soc_files = wishbone_arbiter.vhdl wishbone_bram_wrapper.vhdl sync_fifo.vhdl \
+	wishbone_debug_master.vhdl xics.vhdl syscon.vhdl gpio.vhdl soc.vhdl \
 	spi_rxtx.vhdl spi_flash_ctrl.vhdl

 uart_files = $(wildcard uart16550/*.v)

-soc_sim_files = $(soc_files) sim_console.vhdl sim_pp_uart.vhdl sim_bram_helpers.vhdl \
+soc_sim_files = $(core_files) $(soc_files) sim_console.vhdl sim_pp_uart.vhdl sim_bram_helpers.vhdl \
 	sim_bram.vhdl sim_jtag_socket.vhdl sim_jtag.vhdl dmi_dtm_xilinx.vhdl \
 	sim_16550_uart.vhdl \
-	random.vhdl glibc_random.vhdl glibc_random_helpers.vhdl
+	foreign_random.vhdl glibc_random.vhdl glibc_random_helpers.vhdl

 soc_sim_c_files = sim_vhpi_c.c sim_bram_helpers_c.c sim_console_c.c \
 	sim_jtag_socket_c.c
@ -76,7 +90,6 @@ $(unisim_lib): $(unisim_lib_files)
 	$(GHDL) -i --std=08 --work=unisim --workdir=$(unisim_dir) $^
 GHDLFLAGS += -P$(unisim_dir)

-core_tbs = multiply_tb divider_tb rotator_tb countzero_tb
 soc_tbs = core_tb icache_tb dcache_tb dmi_dtm_tb wishbone_bram_tb
 soc_flash_tbs = core_flash_tb
 soc_dram_tbs = dram_tb core_dram_tb
@ -102,9 +115,6 @@ $(soc_flash_tbs): %: $(soc_sim_files) $(soc_sim_obj_files) $(unisim_lib) $(fmf_l
 $(soc_tbs): %: $(soc_sim_files) $(soc_sim_obj_files) $(unisim_lib) %.vhdl
 	$(GHDL) -c $(GHDLFLAGS) $(soc_sim_link) $(soc_sim_files) $@.vhdl -e $@

-$(core_tbs): %: $(core_files) glibc_random.vhdl glibc_random_helpers.vhdl %.vhdl
-	$(GHDL) -c $(GHDLFLAGS) $(core_files) glibc_random.vhdl glibc_random_helpers.vhdl $@.vhdl -e $@
-
 soc_reset_tb: fpga/soc_reset_tb.vhdl fpga/soc_reset.vhdl
 	$(GHDL) -c $(GHDLFLAGS) fpga/soc_reset_tb.vhdl fpga/soc_reset.vhdl -e $@

@ -115,10 +125,8 @@ $(soc_dram_tbs):
 	$(error "Verilator is required to make this target !")
 else

-VERILATOR_CFLAGS=-O3
-VERILATOR_FLAGS=-O3
 verilated_dram: litedram/generated/sim/litedram_core.v
-	verilator $(VERILATOR_FLAGS) -CFLAGS $(VERILATOR_CFLAGS) -Wno-fatal --cc $< --trace
+	verilator $(VERILATOR_FLAGS) -CFLAGS "$(VERILATOR_CFLAGS)" -Wno-fatal --cc $<
 	make -C obj_dir -f ../litedram/extras/sim_dram_verilate.mk VERILATOR_ROOT=$(VERILATOR_ROOT)

 SIM_DRAM_CFLAGS  = -I. -Iobj_dir -Ilitedram/generated/sim -I$(VERILATOR_ROOT)/include -I$(VERILATOR_ROOT)/include/vltstd
@ -126,7 +134,7 @@ SIM_DRAM_CFLAGS += -DVM_COVERAGE=0 -DVM_SC=0 -DVM_TRACE=1 -DVL_PRINTF=printf -fa
 sim_litedram_c.o: litedram/extras/sim_litedram_c.cpp verilated_dram
 	$(CC)  $(CPPFLAGS) $(SIM_DRAM_CFLAGS) $(CFLAGS) -c $< -o $@

-soc_dram_files = $(soc_files) litedram/extras/litedram-wrapper-l2.vhdl litedram/generated/sim/litedram-initmem.vhdl
+soc_dram_files = $(core_files) $(soc_files) litedram/extras/litedram-wrapper-l2.vhdl litedram/generated/sim/litedram-initmem.vhdl
 soc_dram_sim_files = $(soc_sim_files) litedram/extras/sim_litedram.vhdl
 soc_dram_sim_obj_files = $(soc_sim_obj_files) sim_litedram_c.o
 dram_link_files=-Wl,obj_dir/Vlitedram_core__ALL.a -Wl,obj_dir/verilated.o -Wl,obj_dir/verilated_vcd_c.o -Wl,-lstdc++
@ -137,25 +145,54 @@ $(soc_dram_tbs): %: $(soc_dram_files) $(soc_dram_sim_files) $(soc_dram_sim_obj_f
 endif

 # Hello world
-MEMORY_SIZE=8192
-RAM_INIT_FILE=hello_world/hello_world.hex
+MEMORY_SIZE ?=8192
+RAM_INIT_FILE ?=hello_world/hello_world.hex

 # Micropython
 #MEMORY_SIZE=393216
 #RAM_INIT_FILE=micropython/firmware.hex

-FPGA_TARGET ?= ORANGE-CRAB
+FPGA_TARGET ?= ORANGE-CRAB-0.21
+
+# FIXME: icache RAMs aren't being inferred as block RAMs on ECP5
+# with yosys, so make it smaller for now as a workaround.
+ICACHE_NUM_LINES=4
+
+clkgen=fpga/clk_gen_ecp5.vhd
+toplevel=fpga/top-generic.vhdl
+dmi_dtm=dmi_dtm_dummy.vhdl
+LITEDRAM_GHDL_ARG=

-# OrangeCrab with ECP85
+# OrangeCrab with ECP85 (original v0.0 with UM5G-85 chip)
 ifeq ($(FPGA_TARGET), ORANGE-CRAB)
 RESET_LOW=true
-CLK_INPUT=50000000
-CLK_FREQUENCY=40000000
+CLK_INPUT=48000000
+CLK_FREQUENCY=48000000
 LPF=constraints/orange-crab.lpf
 PACKAGE=CSFBGA285
-NEXTPNR_FLAGS=--um5g-85k --freq 40
+NEXTPNR_FLAGS=--um5g-85k --freq 48
 OPENOCD_JTAG_CONFIG=openocd/olimex-arm-usb-tiny-h.cfg
 OPENOCD_DEVICE_CONFIG=openocd/LFE5UM5G-85F.cfg
+ECP_FLASH_OFFSET=0x80000
+endif
+
+# OrangeCrab with ECP85 (v0.21)
+ifeq ($(FPGA_TARGET), ORANGE-CRAB-0.21)
+RESET_LOW=true
+CLK_INPUT=48000000
+CLK_FREQUENCY=48000000
+LPF=constraints/orange-crab-0.2.lpf
+PACKAGE=CSFBGA285
+NEXTPNR_FLAGS=--85k --speed 8 --freq 48 --timing-allow-fail --ignore-loops
+OPENOCD_JTAG_CONFIG=openocd/olimex-arm-usb-tiny-h.cfg
+OPENOCD_DEVICE_CONFIG=openocd/LFE5U-85F.cfg
+DFU_VENDOR=1209
+DFU_PRODUCT=5af0
+ECP_FLASH_OFFSET=0x80000
+toplevel=fpga/top-orangecrab0.2.vhdl
+litedram_target=orangecrab-85-0.2
+soc_extra_v += litesdcard/generated/lattice/litesdcard_core.v
+dmi_dtm=dmi_dtm_ecp5.vhdl
 endif

 # ECP5-EVN
@ -170,12 +207,17 @@ OPENOCD_JTAG_CONFIG=openocd/ecp5-evn.cfg
 OPENOCD_DEVICE_CONFIG=openocd/LFE5UM5G-85F.cfg
 endif

+ifneq ($(litedram_target),)
+soc_extra_synth += litedram/extras/litedram-wrapper-l2.vhdl \
+	litedram/generated/$(litedram_target)/litedram-initmem.vhdl
+soc_extra_v += litedram/generated/$(litedram_target)/litedram_core.v
+LITEDRAM_GHDL_ARG=-gUSE_LITEDRAM=true
+endif
+
 GHDL_IMAGE_GENERICS=-gMEMORY_SIZE=$(MEMORY_SIZE) -gRAM_INIT_FILE=$(RAM_INIT_FILE) \
-	-gRESET_LOW=$(RESET_LOW) -gCLK_INPUT=$(CLK_INPUT) -gCLK_FREQUENCY=$(CLK_FREQUENCY)
+	-gRESET_LOW=$(RESET_LOW) -gCLK_INPUT=$(CLK_INPUT) -gCLK_FREQUENCY=$(CLK_FREQUENCY) -gICACHE_NUM_LINES=$(ICACHE_NUM_LINES) \
+	$(LITEDRAM_GHDL_ARG)

-clkgen=fpga/clk_gen_ecp5.vhd
-toplevel=fpga/top-generic.vhdl
-dmi_dtm=dmi_dtm_jtag.vhdl dmi_dtm_dummy.vhdl

 ifeq ($(FPGA_TARGET), verilator)
 RESET_LOW=true
@ -184,22 +226,20 @@ CLK_FREQUENCY=50000000
 clkgen=fpga/clk_gen_bypass.vhd
 endif

-fpga_files = $(core_files) $(soc_files) fpga/soc_reset.vhdl \
+fpga_files = fpga/soc_reset.vhdl \
 	fpga/pp_fifo.vhd fpga/pp_soc_uart.vhd fpga/main_bram.vhdl \
 	nonrandom.vhdl

-synth_files = $(core_files) $(soc_files) $(fpga_files) $(clkgen) $(toplevel) $(dmi_dtm)
+synth_files = $(core_files) $(soc_files) $(soc_extra_synth) $(fpga_files) $(clkgen) $(toplevel) $(dmi_dtm)

 microwatt.json: $(synth_files) $(RAM_INIT_FILE)
-	$(YOSYS) -m $(GHDLSYNTH) -p "ghdl --std=08 --no-formal $(GHDL_IMAGE_GENERICS) $(GHDL_TARGET_GENERICS) $(synth_files) -e toplevel; synth_ecp5 -json $@  $(SYNTH_ECP5_FLAGS)" $(uart_files)
+	$(YOSYS) $(GHDLSYNTH) -p "ghdl --std=08 --no-formal $(GHDL_IMAGE_GENERICS) $(synth_files) -e toplevel; read_verilog $(uart_files) $(soc_extra_v); synth_ecp5 -abc9 -nowidelut -json $@  $(SYNTH_ECP5_FLAGS)"

 microwatt.v: $(synth_files) $(RAM_INIT_FILE)
-	$(YOSYS) -m $(GHDLSYNTH) -p "ghdl --std=08 --no-formal $(GHDL_IMAGE_GENERICS) $(GHDL_TARGET_GENERICS) $(synth_files) -e toplevel; write_verilog $@"
+	$(YOSYS) $(GHDLSYNTH) -p "ghdl --std=08 --no-formal $(GHDL_IMAGE_GENERICS) $(synth_files) -e toplevel; write_verilog $@"

-# Need to investigate why yosys is hitting verilator warnings, and eventually turn on -Wall
-microwatt-verilator: microwatt.v verilator/microwatt-verilator.cpp verilator/uart-verilator.c verilator/jtag-verilator.c
-	verilator -O3 -CFLAGS "-DCLK_FREQUENCY=$(CLK_FREQUENCY)" --assert --cc microwatt.v --exe verilator/microwatt-verilator.cpp verilator/uart-verilator.c verilator/jtag-verilator.c -o $@ -Iuart16550 -Ijtag_tap -Wno-fatal -Wno-CASEOVERLAP -Wno-UNOPTFLAT --trace
-	make -C obj_dir -f Vmicrowatt.mk
+microwatt-verilator: microwatt.v verilator/microwatt-verilator.cpp verilator/uart-verilator.c
+	$(VERILATOR) $(VERILATOR_FLAGS) -CFLAGS "$(VERILATOR_CFLAGS) -DCLK_FREQUENCY=$(CLK_FREQUENCY)" -Iuart16550 --assert --cc --exe --build $^ -o $@ -top-module toplevel
 	@cp -f obj_dir/microwatt-verilator microwatt-verilator

 microwatt_out.config: microwatt.json $(LPF)
@ -207,18 +247,36 @@ microwatt_out.config: microwatt.json $(LPF)
 	mv -f $@.tmp $@

 microwatt.bit: microwatt_out.config
-	$(ECPPACK) --svf microwatt.svf $< $@
+	$(ECPPACK) --compress --freq 38.8 --svf microwatt.svf $< $@

 microwatt.svf: microwatt.bit

 prog: microwatt.svf
 	$(OPENOCD) -f $(OPENOCD_JTAG_CONFIG) -f $(OPENOCD_DEVICE_CONFIG) -c "transport select jtag; init; svf $<; exit"

+microwatt.dfu: microwatt.bit
+	cp $< $@.tmp
+	$(DFUSUFFIX) -v $(DFU_VENDOR) -p $(DFU_PRODUCT) -a $@.tmp
+	mv $@.tmp $@
+
+dfuprog: microwatt.dfu
+	$(DFUUTIL) -a 0 -D $<
+
+ecpprog: microwatt.bit
+	$(ECPPROG) -S $<
+
+ecpflash: microwatt.bit
+	test -n "$(ECP_FLASH_OFFSET)" || (echo Error: No ECP_FLASH_OFFSET defined for target; exit 1)
+	$(ECPPROG) -o $(ECP_FLASH_OFFSET) $<
+
 tests = $(sort $(patsubst tests/%.out,%,$(wildcard tests/*.out)))
 tests_console = $(sort $(patsubst tests/%.console_out,%,$(wildcard tests/*.console_out)))

 tests_console: $(tests_console)

+check_vunit:
+	$(VUNITRUN) $(VUNITARGS)
+
 check: $(tests) tests_console test_micropython test_micropython_long tests_unit

 check_light: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 test_micropython test_micropython_long tests_console tests_unit
@ -232,22 +290,24 @@ $(tests_console): core_tb
 test_micropython: core_tb
 	@./scripts/test_micropython.py

+test_micropython_verilator: microwatt-verilator
+	@./scripts/test_micropython_verilator.py
+
 test_micropython_long: core_tb
 	@./scripts/test_micropython_long.py

-tests_core_tb = $(patsubst %_tb,%_tb_test,$(core_tbs))
+test_micropython_verilator_long: microwatt-verilator
+	@./scripts/test_micropython_verilator_long.py
+
 tests_soc_tb = $(patsubst %_tb,%_tb_test,$(soc_tbs))

 %_test: %
 	./$< --assert-level=error > /dev/null

-tests_core: $(tests_core_tb)
-
 tests_soc: $(tests_soc_tb)

 # FIXME SOC tests have bit rotted, so disable for now
-#tests_unit: tests_core tests_soc
-tests_unit: tests_core
+#tests_unit: tests_soc

 TAGS:
 	find . -name '*.vhdl' | xargs ./scripts/vhdltags
--- a/README.md
+++ b/README.md
@ -97,15 +97,19 @@ sudo dnf copr enable sharkcz/danny
 sudo dnf install fusesoc
 ```

- Create a working directory and point FuseSoC at microwatt:
+- If this is your first time using FuseSoC, initialize it first.
+  This is needed to be able to pull down the FuseSoC library components
+  referenced by microwatt. Run:

 ```
-mkdir microwatt-fusesoc
-cd microwatt-fusesoc
-fusesoc library add microwatt /path/to/microwatt/
+fusesoc init
+fusesoc fetch uart16550
+fusesoc library add microwatt /path/to/microwatt
 ```

 - Build using FuseSoC. For hello world (Replace nexys_video with your FPGA board such as --target=arty_a7-100):
+  You may wish to ensure you have [installed Digilent Board files](https://reference.digilentinc.com/vivado/installing-vivado/start#installing_digilent_board_files) 
+  or appropriate files for your board first.

 ```
 fusesoc run --target=nexys_video microwatt --memory_size=16384 --ram_init_file=/path/to/microwatt/fpga/hello_world.hex
@ -118,6 +122,68 @@ You should then be able to see output via the serial port of the board (/dev/tty
 fusesoc run --target=nexys_video microwatt
 ```

+## Linux on Microwatt
+
+Mainline Linux supports Microwatt as of v5.14. The Arty A7 is the best tested
+platform, but it's also been tested on the OrangeCrab and ButterStick.
+
+1. Use buildroot to create a userspace
+
+   A small change is required to glibc in order to support the VMX/AltiVec-less
+   Microwatt, as float128 support is mandatory and GCC requires VSX/AltiVec
+   for it. This change is included in Joel's buildroot fork, along with a
+   defconfig:
+   ```
+   git clone -b microwatt https://github.com/shenki/buildroot
+   cd buildroot
+   make ppc64le_microwatt_defconfig
+   make
+   ```
+
+   The output is `output/images/rootfs.cpio`.
+
+2. Build the Linux kernel
+   ```
+   git clone https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
+   cd linux
+   make ARCH=powerpc microwatt_defconfig
+   make ARCH=powerpc CROSS_COMPILE=powerpc64le-linux-gnu- \
+     CONFIG_INITRAMFS_SOURCE=/buildroot/output/images/rootfs.cpio -j`nproc`
+   ```
+
+   The output is `arch/powerpc/boot/dtbImage.microwatt.elf`.
+
+3. Build gateware using FuseSoC
+
+   First configure FuseSoC as above.
+   ```
+   fusesoc run --build --target=arty_a7-100 microwatt --no_bram --memory_size=0
+   ```
+
+   The output is `build/microwatt_0/arty_a7-100-vivado/microwatt_0.bit`.
+
+4. Program the flash
+
+   This operation will overwrite the contents of your flash.
+
+   For the Arty A7 A100, set `FLASH_ADDRESS` to `0x400000` and pass `-f a100`.
+
+   For the Arty A7 A35, set `FLASH_ADDRESS` to `0x300000` and pass `-f a35`.
+   ```
+   microwatt/openocd/flash-arty -f a100 build/microwatt_0/arty_a7-100-vivado/microwatt_0.bit
+   microwatt/openocd/flash-arty -f a100 dtbImage.microwatt.elf -t bin -a $FLASH_ADDRESS
+   ```
+
+5. Connect to the second USB TTY device exposed by the FPGA
+
+   ```
+   minicom -D /dev/ttyUSB1
+   ```
+
+   The gateware has firmware that will look at `FLASH_ADDRESS` and attempt to
+   parse an ELF there, loading it to the address specified in the ELF header
+   and jumping to it.
+
 ## Testing

 - A simple test suite containing random execution test cases and a couple of
@ -129,8 +195,5 @@ make -j$(nproc) check

 ## Issues

-This is functional, but very simple. We still have quite a lot to do:
-
- There are a few instructions still to be implemented
- Need to add caches and bypassing (in progress)
- Need to add supervisor state (in progress)
+- There are a few instructions still to be implemented:
+  - Vector/VMX/VSX
--- a/cache_ram.vhdl
+++ b/cache_ram.vhdl
@ -5,21 +5,21 @@ use ieee.math_real.all;

 entity cache_ram is
    generic(
-	ROW_BITS : integer := 16;
-	WIDTH    : integer := 64;
-	TRACE    : boolean := false;
-	ADD_BUF  : boolean := false
-	);
+        ROW_BITS : integer := 16;
+        WIDTH    : integer := 64;
+        TRACE    : boolean := false;
+        ADD_BUF  : boolean := false
+        );

    port(
-	clk     : in  std_logic;
-	rd_en   : in  std_logic;
-	rd_addr : in  std_logic_vector(ROW_BITS - 1 downto 0);
-	rd_data : out std_logic_vector(WIDTH - 1 downto 0);
-	wr_sel  : in  std_logic_vector(WIDTH/8 - 1 downto 0);
-	wr_addr : in  std_logic_vector(ROW_BITS - 1 downto 0);
-	wr_data : in  std_logic_vector(WIDTH - 1 downto 0)
-	);
+        clk     : in  std_logic;
+        rd_en   : in  std_logic;
+        rd_addr : in  std_logic_vector(ROW_BITS - 1 downto 0);
+        rd_data : out std_logic_vector(WIDTH - 1 downto 0);
+        wr_sel  : in  std_logic_vector(WIDTH/8 - 1 downto 0);
+        wr_addr : in  std_logic_vector(ROW_BITS - 1 downto 0);
+        wr_data : in  std_logic_vector(WIDTH - 1 downto 0)
+        );

 end cache_ram;

@ -35,13 +35,13 @@ architecture rtl of cache_ram is

 begin
    process(clk)
-	variable lbit : integer range 0 to WIDTH - 1;
-	variable mbit : integer range 0 to WIDTH - 1;
-	variable widx : integer range 0 to SIZE - 1;
-	constant sel0 : std_logic_vector(WIDTH/8 - 1 downto 0)
+        variable lbit : integer range 0 to WIDTH - 1;
+        variable mbit : integer range 0 to WIDTH - 1;
+        variable widx : integer range 0 to SIZE - 1;
+        constant sel0 : std_logic_vector(WIDTH/8 - 1 downto 0)
            := (others => '0');
    begin
-	if rising_edge(clk) then
+        if rising_edge(clk) then
            if TRACE then
                if wr_sel /= sel0 then
                    report "write a:" & to_hstring(wr_addr) &
@ -57,29 +57,29 @@ begin
                    ram(widx)(mbit downto lbit) <= wr_data(mbit downto lbit);
                end if;
            end loop;
-	    if rd_en = '1' then
-		rd_data0 <= ram(to_integer(unsigned(rd_addr)));
-		if TRACE then
-		    report "read a:" & to_hstring(rd_addr) &
-			" dat:" & to_hstring(ram(to_integer(unsigned(rd_addr))));
-		end if;
-	    end if;
-	end if;
+            if rd_en = '1' then
+                rd_data0 <= ram(to_integer(unsigned(rd_addr)));
+                if TRACE then
+                    report "read a:" & to_hstring(rd_addr) &
+                        " dat:" & to_hstring(ram(to_integer(unsigned(rd_addr))));
+                end if;
+            end if;
+        end if;
    end process;

    buf: if ADD_BUF generate
    begin
-	process(clk)
-	begin
-	    if rising_edge(clk) then
-		rd_data <= rd_data0;
-	    end if;
-	end process;
+        process(clk)
+        begin
+            if rising_edge(clk) then
+                rd_data <= rd_data0;
+            end if;
+        end process;
    end generate;

    nobuf: if not ADD_BUF generate
    begin
-	rd_data <= rd_data0;
+        rd_data <= rd_data0;
    end generate;

 end;
--- a/common.vhdl
+++ b/common.vhdl
@ -3,6 +3,7 @@ use ieee.std_logic_1164.all;
 use ieee.numeric_std.all;

 library work;
+use work.utils.all;
 use work.decode_types.all;

 package common is
@ -20,6 +21,7 @@ package common is
    constant MSR_FE1 : integer := (63 - 55);    -- Floating Exception mode
    constant MSR_IR  : integer := (63 - 58);    -- Instruction Relocation
    constant MSR_DR  : integer := (63 - 59);    -- Data Relocation
+    constant MSR_PMM : integer := (63 - 61);    -- Performance Monitor Mark
    constant MSR_RI  : integer := (63 - 62);    -- Recoverable Interrupt
    constant MSR_LE  : integer := (63 - 63);    -- Little Endian

@ -50,9 +52,37 @@ package common is
    constant SPR_HSPRG0 : spr_num_t := 304;
    constant SPR_HSPRG1 : spr_num_t := 305;
    constant SPR_PID    : spr_num_t := 48;
-    constant SPR_PRTBL  : spr_num_t := 720;
+    constant SPR_PTCR   : spr_num_t := 464;
    constant SPR_PVR	: spr_num_t := 287;

+    -- PMU registers
+    constant SPR_UPMC1  : spr_num_t := 771;
+    constant SPR_UPMC2  : spr_num_t := 772;
+    constant SPR_UPMC3  : spr_num_t := 773;
+    constant SPR_UPMC4  : spr_num_t := 774;
+    constant SPR_UPMC5  : spr_num_t := 775;
+    constant SPR_UPMC6  : spr_num_t := 776;
+    constant SPR_UMMCR0 : spr_num_t := 779;
+    constant SPR_UMMCR1 : spr_num_t := 782;
+    constant SPR_UMMCR2 : spr_num_t := 769;
+    constant SPR_UMMCRA : spr_num_t := 770;
+    constant SPR_USIER  : spr_num_t := 768;
+    constant SPR_USIAR  : spr_num_t := 780;
+    constant SPR_USDAR  : spr_num_t := 781;
+    constant SPR_PMC1   : spr_num_t := 787;
+    constant SPR_PMC2   : spr_num_t := 788;
+    constant SPR_PMC3   : spr_num_t := 789;
+    constant SPR_PMC4   : spr_num_t := 790;
+    constant SPR_PMC5   : spr_num_t := 791;
+    constant SPR_PMC6   : spr_num_t := 792;
+    constant SPR_MMCR0  : spr_num_t := 795;
+    constant SPR_MMCR1  : spr_num_t := 798;
+    constant SPR_MMCR2  : spr_num_t := 785;
+    constant SPR_MMCRA  : spr_num_t := 786;
+    constant SPR_SIER   : spr_num_t := 784;
+    constant SPR_SIAR   : spr_num_t := 796;
+    constant SPR_SDAR   : spr_num_t := 797;
+
    -- GPR indices in the register file (GPR only)
    subtype gpr_index_t is std_ulogic_vector(4 downto 0);

@ -126,7 +156,25 @@ package common is
    constant FPSCR_NI     : integer := 63 - 61;
    constant FPSCR_RN     : integer := 63 - 63;

-    type irq_state_t is (WRITE_SRR0, WRITE_SRR1);
+    -- Real addresses
+    -- REAL_ADDR_BITS is the number of real address bits that we store
+    constant REAL_ADDR_BITS : positive := 56;
+    subtype real_addr_t is std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0);
+    function addr_to_real(addr: std_ulogic_vector(63 downto 0)) return real_addr_t;
+
+    -- Used for tracking instruction completion and pending register writes
+    constant TAG_COUNT : positive := 4;
+    constant TAG_NUMBER_BITS : natural := log2(TAG_COUNT);
+    subtype tag_number_t is integer range 0 to TAG_COUNT - 1;
+    subtype tag_index_t is unsigned(TAG_NUMBER_BITS - 1 downto 0);
+    type instr_tag_t is record
+        tag   : tag_number_t;
+        valid : std_ulogic;
+    end record;
+    constant instr_tag_init : instr_tag_t := (tag => 0, valid => '0');
+    function tag_match(tag1 : instr_tag_t; tag2 : instr_tag_t) return boolean;
+
+    subtype intr_vector_t is integer range 0 to 16#fff#;

    -- For now, fixed 16 sources, make this either a parametric
    -- package of some sort or an unconstrainted array.
@ -144,8 +192,6 @@ package common is
 	dec: std_ulogic_vector(63 downto 0);
 	msr: std_ulogic_vector(63 downto 0);
        cfar: std_ulogic_vector(63 downto 0);
-	irq_state : irq_state_t;
-	srr1: std_ulogic_vector(63 downto 0);
    end record;

    type Fetch1ToIcacheType is record
@ -154,7 +200,8 @@ package common is
        priv_mode : std_ulogic;
        big_endian : std_ulogic;
 	stop_mark: std_ulogic;
-        sequential: std_ulogic;
+        predicted : std_ulogic;
+        pred_ntaken : std_ulogic;
 	nia: std_ulogic_vector(63 downto 0);
    end record;

@ -164,6 +211,14 @@ package common is
        fetch_failed: std_ulogic;
 	nia: std_ulogic_vector(63 downto 0);
 	insn: std_ulogic_vector(31 downto 0);
+        big_endian: std_ulogic;
+        next_predicted: std_ulogic;
+        next_pred_ntaken: std_ulogic;
+    end record;
+
+    type IcacheEventType is record
+        icache_miss : std_ulogic;
+        itlb_miss_resolved : std_ulogic;
    end record;

    type Decode1ToDecode2Type is record
@ -173,44 +228,61 @@ package common is
 	insn: std_ulogic_vector(31 downto 0);
 	ispr1: gspr_index_t; -- (G)SPR used for branch condition (CTR) or mfspr
 	ispr2: gspr_index_t; -- (G)SPR used for branch target (CTR, LR, TAR)
+	ispro: gspr_index_t; -- (G)SPR written with LR or CTR
 	decode: decode_rom_t;
        br_pred: std_ulogic; -- Branch was predicted to be taken
+        big_endian: std_ulogic;
    end record;
    constant Decode1ToDecode2Init : Decode1ToDecode2Type :=
        (valid => '0', stop_mark => '0', nia => (others => '0'), insn => (others => '0'),
-         ispr1 => (others => '0'), ispr2 => (others => '0'), decode => decode_rom_init, br_pred => '0');
+         ispr1 => (others => '0'), ispr2 => (others => '0'), ispro => (others => '0'),
+         decode => decode_rom_init, br_pred => '0', big_endian => '0');

    type Decode1ToFetch1Type is record
        redirect     : std_ulogic;
        redirect_nia : std_ulogic_vector(63 downto 0);
    end record;

+    type bypass_data_t is record
+        tag  : instr_tag_t;
+        data : std_ulogic_vector(63 downto 0);
+    end record;
+    constant bypass_data_init : bypass_data_t := (tag => instr_tag_init, data => (others => '0'));
+
+    type cr_bypass_data_t is record
+        tag  : instr_tag_t;
+        data : std_ulogic_vector(31 downto 0);
+    end record;
+    constant cr_bypass_data_init : cr_bypass_data_t := (tag => instr_tag_init, data => (others => '0'));
+
    type Decode2ToExecute1Type is record
 	valid: std_ulogic;
        unit : unit_t;
+        fac : facility_t;
 	insn_type: insn_type_t;
 	nia: std_ulogic_vector(63 downto 0);
+        instr_tag : instr_tag_t;
 	write_reg: gspr_index_t;
+        write_reg_enable: std_ulogic;
 	read_reg1: gspr_index_t;
 	read_reg2: gspr_index_t;
 	read_data1: std_ulogic_vector(63 downto 0);
 	read_data2: std_ulogic_vector(63 downto 0);
 	read_data3: std_ulogic_vector(63 downto 0);
-        bypass_data1: std_ulogic;
-        bypass_data2: std_ulogic;
-        bypass_data3: std_ulogic;
 	cr: std_ulogic_vector(31 downto 0);
-        bypass_cr : std_ulogic;
 	xerc: xer_common_t;
 	lr: std_ulogic;
+        br_abs: std_ulogic;
 	rc: std_ulogic;
 	oe: std_ulogic;
 	invert_a: std_ulogic;
+        addm1 : std_ulogic;
 	invert_out: std_ulogic;
 	input_carry: carry_in_t;
 	output_carry: std_ulogic;
 	input_cr: std_ulogic;
 	output_cr: std_ulogic;
+        output_xer: std_ulogic;
 	is_32bit: std_ulogic;
 	is_signed: std_ulogic;
 	insn: std_ulogic_vector(31 downto 0);
@ -220,13 +292,23 @@ package common is
 	update : std_ulogic;				-- is this an update instruction?
        reserve : std_ulogic;                           -- set for larx/stcx
        br_pred : std_ulogic;
+        result_sel : std_ulogic_vector(2 downto 0);     -- select source of result
+        sub_select : std_ulogic_vector(2 downto 0);     -- sub-result selection
+        repeat : std_ulogic;                            -- set if instruction is cracked into two ops
+        second : std_ulogic;                            -- set if this is the second op
    end record;
    constant Decode2ToExecute1Init : Decode2ToExecute1Type :=
-	(valid => '0', unit => NONE, insn_type => OP_ILLEGAL, bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0',
-         bypass_cr => '0', lr => '0', rc => '0', oe => '0', invert_a => '0',
-	 invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0',
+	(valid => '0', unit => NONE, fac => NONE, insn_type => OP_ILLEGAL, instr_tag => instr_tag_init,
+         write_reg_enable => '0',
+         lr => '0', br_abs => '0', rc => '0', oe => '0', invert_a => '0', addm1 => '0',
+	 invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0',
+         output_cr => '0', output_xer => '0',
 	 is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', br_pred => '0',
-         byte_reverse => '0', sign_extend => '0', update => '0', nia => (others => '0'), read_data1 => (others => '0'), read_data2 => (others => '0'), read_data3 => (others => '0'), cr => (others => '0'), insn => (others => '0'), data_len => (others => '0'), others => (others => '0'));
+         byte_reverse => '0', sign_extend => '0', update => '0', nia => (others => '0'),
+         read_data1 => (others => '0'), read_data2 => (others => '0'), read_data3 => (others => '0'),
+         cr => (others => '0'), insn => (others => '0'), data_len => (others => '0'),
+         result_sel => "000", sub_select => "000",
+         repeat => '0', second => '0', others => (others => '0'));

    type MultiplyInputType is record
 	valid: std_ulogic;
@ -262,6 +344,51 @@ package common is
                                                              is_extended => '0', is_modulus => '0',
                                                              neg_result => '0', others => (others => '0'));

+    type PMUEventType is record
+        no_instr_avail      : std_ulogic;
+        dispatch            : std_ulogic;
+        ext_interrupt       : std_ulogic;
+        instr_complete      : std_ulogic;
+        fp_complete         : std_ulogic;
+        ld_complete         : std_ulogic;
+        st_complete         : std_ulogic;
+        br_taken_complete   : std_ulogic;
+        br_mispredict       : std_ulogic;
+        ipref_discard       : std_ulogic;
+        itlb_miss           : std_ulogic;
+        itlb_miss_resolved  : std_ulogic;
+        icache_miss         : std_ulogic;
+        dc_miss_resolved    : std_ulogic;
+        dc_load_miss        : std_ulogic;
+        dc_ld_miss_resolved : std_ulogic;
+        dc_store_miss       : std_ulogic;
+        dtlb_miss           : std_ulogic;
+        dtlb_miss_resolved  : std_ulogic;
+        ld_miss_nocache     : std_ulogic;
+        ld_fill_nocache     : std_ulogic;
+    end record;
+    constant PMUEventInit : PMUEventType := (others => '0');
+
+    type Execute1ToPMUType is record
+        mfspr   : std_ulogic;
+        mtspr   : std_ulogic;
+        spr_num : std_ulogic_vector(4 downto 0);
+        spr_val : std_ulogic_vector(63 downto 0);
+        tbbits  : std_ulogic_vector(3 downto 0);        -- event bits from timebase
+        pmm_msr : std_ulogic;                           -- PMM bit from MSR
+        pr_msr  : std_ulogic;                           -- PR bit from MSR
+        run     : std_ulogic;
+        nia     : std_ulogic_vector(63 downto 0);
+        addr    : std_ulogic_vector(63 downto 0);
+        addr_v  : std_ulogic;
+        occur   : PMUEventType;
+    end record;
+
+    type PMUToExecute1Type is record
+        spr_val : std_ulogic_vector(63 downto 0);
+        intr    : std_ulogic;
+    end record;
+
    type Decode2ToRegisterFileType is record
 	read1_enable : std_ulogic;
 	read1_reg : gspr_index_t;
@ -272,9 +399,9 @@ package common is
    end record;

    type RegisterFileToDecode2Type is record
-	read1_data : std_ulogic_vector(63 downto 0);
-	read2_data : std_ulogic_vector(63 downto 0);
-	read3_data : std_ulogic_vector(63 downto 0);
+        read1_data : std_ulogic_vector(63 downto 0);
+        read2_data : std_ulogic_vector(63 downto 0);
+        read3_data : std_ulogic_vector(63 downto 0);
    end record;

    type Decode2ToCrFileType is record
@ -286,23 +413,12 @@ package common is
 	read_xerc_data : xer_common_t;
    end record;

-    type Execute1ToFetch1Type is record
-	redirect: std_ulogic;
-        virt_mode: std_ulogic;
-        priv_mode: std_ulogic;
-        big_endian: std_ulogic;
-        mode_32bit: std_ulogic;
-	redirect_nia: std_ulogic_vector(63 downto 0);
-    end record;
-    constant Execute1ToFetch1Init : Execute1ToFetch1Type := (redirect => '0', virt_mode => '0',
-                                                             priv_mode => '0', big_endian => '0',
-                                                             mode_32bit => '0', others => (others => '0'));
-
    type Execute1ToLoadstore1Type is record
 	valid : std_ulogic;
        op : insn_type_t;                               -- what ld/st or m[tf]spr or TLB op to do
        nia : std_ulogic_vector(63 downto 0);
        insn : std_ulogic_vector(31 downto 0);
+        instr_tag : instr_tag_t;
 	addr1 : std_ulogic_vector(63 downto 0);
 	addr2 : std_ulogic_vector(63 downto 0);
 	data : std_ulogic_vector(63 downto 0);		-- data to write, unused for read
@ -312,7 +428,6 @@ package common is
 	byte_reverse : std_ulogic;
 	sign_extend : std_ulogic;			-- do we need to sign extend?
 	update : std_ulogic;				-- is this an update instruction?
-	update_reg : gpr_index_t;                      	-- if so, the register to update
 	xerc : xer_common_t;
        reserve : std_ulogic;                           -- set for larx/stcx.
        rc : std_ulogic;                                -- set for stcx.
@ -320,37 +435,42 @@ package common is
        priv_mode : std_ulogic;                         -- privileged mode (MSR[PR] = 0)
        mode_32bit : std_ulogic;                        -- trim addresses to 32 bits
        is_32bit : std_ulogic;
-    end record;
-    constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', op => OP_ILLEGAL, ci => '0', byte_reverse => '0',
-                                                                     sign_extend => '0', update => '0', xerc => xerc_init,
-                                                                     reserve => '0', rc => '0', virt_mode => '0', priv_mode => '0',
-                                                                     nia => (others => '0'), insn => (others => '0'),
-                                                                     addr1 => (others => '0'), addr2 => (others => '0'), data => (others => '0'),
-                                                                     write_reg => (others => '0'), length => (others => '0'),
-                                                                     mode_32bit => '0', is_32bit => '0', others => (others => '0'));
+        repeat : std_ulogic;
+        second : std_ulogic;
+        msr : std_ulogic_vector(63 downto 0);
+    end record;
+    constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type :=
+        (valid => '0', op => OP_ILLEGAL, ci => '0', byte_reverse => '0',
+         sign_extend => '0', update => '0', xerc => xerc_init,
+         reserve => '0', rc => '0', virt_mode => '0', priv_mode => '0',
+         nia => (others => '0'), insn => (others => '0'),
+         instr_tag => instr_tag_init,
+         addr1 => (others => '0'), addr2 => (others => '0'), data => (others => '0'),
+         write_reg => (others => '0'),
+         length => (others => '0'),
+         mode_32bit => '0', is_32bit => '0',
+         repeat => '0', second => '0',
+         msr => (others => '0'));

    type Loadstore1ToExecute1Type is record
        busy : std_ulogic;
-        exception : std_ulogic;
-        alignment : std_ulogic;
-        invalid : std_ulogic;
-        perm_error : std_ulogic;
-        rc_error : std_ulogic;
-        badtree : std_ulogic;
-        segment_fault : std_ulogic;
-        instr_fault : std_ulogic;
+        in_progress : std_ulogic;
+        interrupt : std_ulogic;
    end record;

    type Loadstore1ToDcacheType is record
 	valid : std_ulogic;
+        hold : std_ulogic;
 	load : std_ulogic;				-- is this a load
        dcbz : std_ulogic;
 	nc : std_ulogic;
        reserve : std_ulogic;
+        atomic : std_ulogic;                            -- part of a multi-transfer atomic op
+        atomic_last : std_ulogic;
        virt_mode : std_ulogic;
        priv_mode : std_ulogic;
 	addr : std_ulogic_vector(63 downto 0);
-	data : std_ulogic_vector(63 downto 0);
+	data : std_ulogic_vector(63 downto 0);          -- valid the cycle after .valid = 1
        byte_sel : std_ulogic_vector(7 downto 0);
    end record;

@ -362,6 +482,14 @@ package common is
        cache_paradox : std_ulogic;
    end record;

+    type DcacheEventType is record  -- five single-bit event flags; NOTE(review): presumably driven by the dcache for the PMU, confirm
+        load_miss          : std_ulogic;
+        store_miss         : std_ulogic;
+        dcache_refill      : std_ulogic;
+        dtlb_miss          : std_ulogic;
+        dtlb_miss_resolved : std_ulogic;
+    end record;
+
    type Loadstore1ToMmuType is record
        valid : std_ulogic;
        tlbie : std_ulogic;
@ -412,18 +540,34 @@ package common is

    type Loadstore1ToWritebackType is record
 	valid : std_ulogic;
+        instr_tag : instr_tag_t;
 	write_enable: std_ulogic;
 	write_reg : gspr_index_t;
 	write_data : std_ulogic_vector(63 downto 0);
 	xerc : xer_common_t;
        rc : std_ulogic;
        store_done : std_ulogic;
+        interrupt : std_ulogic;
+        intr_vec : intr_vector_t;
+        srr0: std_ulogic_vector(63 downto 0);
+        srr1: std_ulogic_vector(15 downto 0);
+    end record;
+    constant Loadstore1ToWritebackInit : Loadstore1ToWritebackType :=
+        (valid => '0', instr_tag => instr_tag_init, write_enable => '0',
+         write_reg => (others => '0'), write_data => (others => '0'),
+         xerc => xerc_init, rc => '0', store_done => '0',
+         interrupt => '0', intr_vec => 0,
+         srr0 => (others => '0'), srr1 => (others => '0'));
+
+    type Loadstore1EventType is record  -- three single-bit event flags; NOTE(review): presumably driven by loadstore1, confirm
+        load_complete  : std_ulogic;
+        store_complete : std_ulogic;
+        itlb_miss      : std_ulogic;
    end record;
-    constant Loadstore1ToWritebackInit : Loadstore1ToWritebackType := (valid => '0', write_enable => '0', xerc => xerc_init,
-                                                                       rc => '0', store_done => '0', write_data => (others => '0'), others => (others => '0'));

    type Execute1ToWritebackType is record
 	valid: std_ulogic;
+        instr_tag : instr_tag_t;
 	rc : std_ulogic;
        mode_32bit : std_ulogic;
 	write_enable : std_ulogic;
@ -434,21 +578,34 @@ package common is
 	write_cr_data : std_ulogic_vector(31 downto 0);
 	write_xerc_enable : std_ulogic;
 	xerc : xer_common_t;
-        exc_write_enable : std_ulogic;
-        exc_write_reg : gspr_index_t;
-        exc_write_data : std_ulogic_vector(63 downto 0);
-    end record;
-    constant Execute1ToWritebackInit : Execute1ToWritebackType := (valid => '0', rc => '0', mode_32bit => '0', write_enable => '0',
-								   write_cr_enable => '0', exc_write_enable => '0',
-								   write_xerc_enable => '0', xerc => xerc_init,
-                                   write_data => (others => '0'), write_cr_mask => (others => '0'),
-                                   write_cr_data => (others => '0'), write_reg => (others => '0'),
-                                   exc_write_reg => (others => '0'), exc_write_data => (others => '0'));
+        interrupt : std_ulogic;
+        intr_vec : intr_vector_t;
+	redirect: std_ulogic;
+        redir_mode: std_ulogic_vector(3 downto 0);
+        last_nia: std_ulogic_vector(63 downto 0);
+        br_offset: std_ulogic_vector(63 downto 0);
+        br_last: std_ulogic;
+        br_taken: std_ulogic;
+        abs_br: std_ulogic;
+        srr1: std_ulogic_vector(15 downto 0);
+        msr: std_ulogic_vector(63 downto 0);
+    end record;
+    constant Execute1ToWritebackInit : Execute1ToWritebackType :=
+        (valid => '0', instr_tag => instr_tag_init, rc => '0', mode_32bit => '0',
+         write_enable => '0', write_cr_enable => '0',
+         write_xerc_enable => '0', xerc => xerc_init,
+         write_data => (others => '0'), write_cr_mask => (others => '0'),
+         write_cr_data => (others => '0'), write_reg => (others => '0'),
+         interrupt => '0', intr_vec => 0, redirect => '0', redir_mode => "0000",
+         last_nia => (others => '0'), br_offset => (others => '0'),
+         br_last => '0', br_taken => '0', abs_br => '0',
+         srr1 => (others => '0'), msr => (others => '0'));

    type Execute1ToFPUType is record
        valid   : std_ulogic;
        op      : insn_type_t;
        nia     : std_ulogic_vector(63 downto 0);
+        itag    : instr_tag_t;
        insn    : std_ulogic_vector(31 downto 0);
        single  : std_ulogic;
        fe_mode : std_ulogic_vector(1 downto 0);
@ -460,6 +617,7 @@ package common is
        out_cr  : std_ulogic;
    end record;
    constant Execute1ToFPUInit : Execute1ToFPUType := (valid => '0', op => OP_ILLEGAL, nia => (others => '0'),
+                                                       itag => instr_tag_init,
                                                       insn  => (others => '0'), fe_mode => "00", rc => '0',
                                                       fra => (others => '0'), frb => (others => '0'),
                                                       frc => (others => '0'), frt => (others => '0'),
@ -468,19 +626,30 @@ package common is
    type FPUToExecute1Type is record
        busy      : std_ulogic;
        exception : std_ulogic;
-        interrupt : std_ulogic;
-        illegal   : std_ulogic;
    end record;
+    constant FPUToExecute1Init : FPUToExecute1Type := (others => '0');

    type FPUToWritebackType is record
        valid           : std_ulogic;
+        interrupt       : std_ulogic;
+        instr_tag       : instr_tag_t;
        write_enable    : std_ulogic;
        write_reg       : gspr_index_t;
        write_data      : std_ulogic_vector(63 downto 0);
        write_cr_enable : std_ulogic;
        write_cr_mask   : std_ulogic_vector(7 downto 0);
        write_cr_data   : std_ulogic_vector(31 downto 0);
-    end record;
+        intr_vec        : intr_vector_t;
+        srr0            : std_ulogic_vector(63 downto 0);
+        srr1            : std_ulogic_vector(15 downto 0);
+    end record;
+    constant FPUToWritebackInit : FPUToWritebackType :=
+        (valid => '0', interrupt => '0', instr_tag => instr_tag_init,
+         write_enable => '0', write_reg => (others => '0'),
+         write_cr_enable => '0', write_cr_mask => (others => '0'),
+         write_cr_data => (others => '0'),
+         intr_vec => 0, srr1 => (others => '0'),
+         others => (others => '0'));

    type DividerToExecute1Type is record
 	valid: std_ulogic;
@ -490,12 +659,29 @@ package common is
    constant DividerToExecute1Init : DividerToExecute1Type := (valid => '0', overflow => '0',
                                                               others => (others => '0'));

+    type WritebackToFetch1Type is record  -- redirect, mode and branch signals from writeback to fetch1 (per the type name)
+	redirect: std_ulogic;
+        virt_mode: std_ulogic;
+        priv_mode: std_ulogic;
+        big_endian: std_ulogic;
+        mode_32bit: std_ulogic;
+	redirect_nia: std_ulogic_vector(63 downto 0);  -- 64-bit address (presumably the fetch target when redirect = '1'; confirm in fetch1)
+        br_nia : std_ulogic_vector(63 downto 0);
+        br_last : std_ulogic;
+        br_taken : std_ulogic;
+    end record;
+    constant WritebackToFetch1Init : WritebackToFetch1Type :=  -- default value: every field zero
+        (redirect => '0', virt_mode => '0', priv_mode => '0', big_endian => '0',
+         mode_32bit => '0', redirect_nia => (others => '0'),
+         br_last => '0', br_taken => '0', br_nia => (others => '0'));
+
    type WritebackToRegisterFileType is record
 	write_reg : gspr_index_t;
 	write_data : std_ulogic_vector(63 downto 0);
 	write_enable : std_ulogic;
    end record;
-    constant WritebackToRegisterFileInit : WritebackToRegisterFileType := (write_enable => '0', write_data => (others => '0'), others => (others => '0'));
+    constant WritebackToRegisterFileInit : WritebackToRegisterFileType :=
+        (write_enable => '0', write_data => (others => '0'), others => (others => '0'));

    type WritebackToCrFileType is record
 	write_cr_enable : std_ulogic;
@ -509,6 +695,11 @@ package common is
 							       write_cr_mask => (others => '0'),
 							       write_cr_data => (others => '0'));

+    type WritebackEventType is record  -- two single-bit event flags; NOTE(review): presumably driven by writeback, confirm
+        instr_complete : std_ulogic;
+        fp_complete    : std_ulogic;
+    end record;
+
 end common;

 package body common is
@ -525,9 +716,9 @@ package body common is
    begin
       case spr is
       when SPR_LR =>
-           n := 0;
+           n := 0;              -- N.B. decode2 relies on this specific value
       when SPR_CTR =>
-           n:= 1;
+           n := 1;              -- N.B. decode2 relies on this specific value
       when SPR_SRR0 =>
           n := 2;
       when SPR_SRR1 =>
@ -588,4 +779,14 @@ package body common is
    begin
        return "10" & f;
    end;
+
+    function tag_match(tag1 : instr_tag_t; tag2 : instr_tag_t) return boolean is  -- do two instruction tags refer to the same instruction?
+    begin
+        return tag1.valid = '1' and tag2.valid = '1' and tag1.tag = tag2.tag;  -- true only if both tags are valid and their tag numbers are equal
+    end;
+
+    function addr_to_real(addr: std_ulogic_vector(63 downto 0)) return real_addr_t is  -- narrow a 64-bit address to real_addr_t
+    begin
+        return addr(real_addr_t'range);  -- keep only the bits inside real_addr_t's index range; the rest are dropped
+    end;
 end common;
--- a/constraints/orange-crab-0.2.lpf
+++ b/constraints/orange-crab-0.2.lpf
@ -0,0 +1,225 @@
+LOCATE COMP "ext_clk" SITE "A9";
+IOBUF PORT "ext_clk" IO_TYPE=LVCMOS33;
+
+// LOCATE COMP "ext_rst_n" SITE "J2"; // io_13
+// IOBUF PORT "ext_rst_n" PULLMODE=UP IO_TYPE=LVCMOS33 DRIVE=4;
+
+// user_button as reset
+LOCATE COMP "ext_rst_n" SITE "J17";
+IOBUF PORT "ext_rst_n" IO_TYPE=SSTL135_I;
+
+LOCATE COMP "usb_d_p" SITE "N1";
+LOCATE COMP "usb_d_n" SITE "M2";
+LOCATE COMP "usb_pullup" SITE "N2";
+
+IOBUF PORT "usb_d_p" IO_TYPE=LVCMOS33;
+IOBUF PORT "usb_d_n" IO_TYPE=LVCMOS33;
+IOBUF PORT "usb_pullup" IO_TYPE=LVCMOS33;
+
+LOCATE COMP "led0_g" SITE "M3";
+LOCATE COMP "led0_r" SITE "K4";
+LOCATE COMP "led0_b" SITE "J3";
+
+IOBUF PORT "led0_g" IO_TYPE=LVCMOS33; // one IOBUF entry per LED pin located above
+IOBUF PORT "led0_r" IO_TYPE=LVCMOS33;
+IOBUF PORT "led0_b" IO_TYPE=LVCMOS33;
+
+// discontinuous gpio numbers, match orangecrab litex platform
+LOCATE COMP "pin_gpio_0" SITE "N17"; // tx
+LOCATE COMP "pin_gpio_1" SITE "M18"; // rx
+LOCATE COMP "pin_gpio_2" SITE "C10"; // sda
+LOCATE COMP "pin_gpio_3" SITE "C9"; // scl
+//
+LOCATE COMP "pin_gpio_5" SITE "B10"; // io_5
+LOCATE COMP "pin_gpio_6" SITE "B9"; // ...
+//
+LOCATE COMP "pin_gpio_9" SITE "C8"; //
+LOCATE COMP "pin_gpio_10" SITE "B8"; //
+LOCATE COMP "pin_gpio_11" SITE "A8"; //
+LOCATE COMP "pin_gpio_12" SITE "H2"; //
+LOCATE COMP "pin_gpio_13" SITE "J2"; // io_13
+LOCATE COMP "pin_gpio_14" SITE "N15"; // miso
+LOCATE COMP "pin_gpio_15" SITE "R17"; // sck
+LOCATE COMP "pin_gpio_16" SITE "N16"; // mosi
+
+LOCATE COMP "pin_io_a0" SITE "L4";
+LOCATE COMP "pin_io_a1" SITE "N3";
+LOCATE COMP "pin_io_a2" SITE "N4";
+LOCATE COMP "pin_io_a3" SITE "H4";
+LOCATE COMP "pin_io_a4" SITE "G4";
+LOCATE COMP "pin_io_a5" SITE "T17";
+
+IOBUF PORT "pin_gpio_0" IO_TYPE=LVCMOS33;
+IOBUF PORT "pin_gpio_1" IO_TYPE=LVCMOS33;
+IOBUF PORT "pin_gpio_2" IO_TYPE=LVCMOS33;
+IOBUF PORT "pin_gpio_3" IO_TYPE=LVCMOS33;
+IOBUF PORT "pin_gpio_5" IO_TYPE=LVCMOS33;
+IOBUF PORT "pin_gpio_6" IO_TYPE=LVCMOS33;
+IOBUF PORT "pin_gpio_9" IO_TYPE=LVCMOS33;
+IOBUF PORT "pin_gpio_10" IO_TYPE=LVCMOS33;
+IOBUF PORT "pin_gpio_11" IO_TYPE=LVCMOS33;
+IOBUF PORT "pin_gpio_12" IO_TYPE=LVCMOS33;
+IOBUF PORT "pin_gpio_13" IO_TYPE=LVCMOS33;
+IOBUF PORT "pin_gpio_14" IO_TYPE=LVCMOS33;
+IOBUF PORT "pin_gpio_15" IO_TYPE=LVCMOS33;
+IOBUF PORT "pin_gpio_16" IO_TYPE=LVCMOS33;
+IOBUF PORT "pin_io_a0" IO_TYPE=LVCMOS33;
+IOBUF PORT "pin_io_a1" IO_TYPE=LVCMOS33;
+IOBUF PORT "pin_io_a2" IO_TYPE=LVCMOS33;
+IOBUF PORT "pin_io_a3" IO_TYPE=LVCMOS33;
+IOBUF PORT "pin_io_a4" IO_TYPE=LVCMOS33;
+IOBUF PORT "pin_io_a5" IO_TYPE=LVCMOS33;
+
+LOCATE COMP "ddram_a[0]"  SITE "C4";
+LOCATE COMP "ddram_a[1]"  SITE "D2";
+LOCATE COMP "ddram_a[2]"  SITE "D3";
+LOCATE COMP "ddram_a[3]"  SITE "A3";
+LOCATE COMP "ddram_a[4]"  SITE "A4";
+LOCATE COMP "ddram_a[5]"  SITE "D4";
+LOCATE COMP "ddram_a[6]"  SITE "C3";
+LOCATE COMP "ddram_a[7]"  SITE "B2";
+LOCATE COMP "ddram_a[8]"  SITE "B1";
+LOCATE COMP "ddram_a[9]"  SITE "D1";
+LOCATE COMP "ddram_a[10]" SITE "A7";
+LOCATE COMP "ddram_a[11]" SITE "C2";
+LOCATE COMP "ddram_a[12]" SITE "B6";
+LOCATE COMP "ddram_a[13]" SITE "C1";
+LOCATE COMP "ddram_a[14]" SITE "A2";
+LOCATE COMP "ddram_a[15]" SITE "C7";
+IOBUF PORT "ddram_a[0]"  IO_TYPE=SSTL135_I SLEWRATE=FAST;
+IOBUF PORT "ddram_a[1]"  IO_TYPE=SSTL135_I SLEWRATE=FAST;
+IOBUF PORT "ddram_a[2]"  IO_TYPE=SSTL135_I SLEWRATE=FAST;
+IOBUF PORT "ddram_a[3]"  IO_TYPE=SSTL135_I SLEWRATE=FAST;
+IOBUF PORT "ddram_a[4]"  IO_TYPE=SSTL135_I SLEWRATE=FAST;
+IOBUF PORT "ddram_a[5]"  IO_TYPE=SSTL135_I SLEWRATE=FAST;
+IOBUF PORT "ddram_a[6]"  IO_TYPE=SSTL135_I SLEWRATE=FAST;
+IOBUF PORT "ddram_a[7]"  IO_TYPE=SSTL135_I SLEWRATE=FAST;
+IOBUF PORT "ddram_a[8]"  IO_TYPE=SSTL135_I SLEWRATE=FAST;
+IOBUF PORT "ddram_a[9]"  IO_TYPE=SSTL135_I SLEWRATE=FAST;
+IOBUF PORT "ddram_a[10]" IO_TYPE=SSTL135_I SLEWRATE=FAST;
+IOBUF PORT "ddram_a[11]" IO_TYPE=SSTL135_I SLEWRATE=FAST;
+IOBUF PORT "ddram_a[12]" IO_TYPE=SSTL135_I SLEWRATE=FAST;
+IOBUF PORT "ddram_a[13]" IO_TYPE=SSTL135_I SLEWRATE=FAST;
+IOBUF PORT "ddram_a[14]" IO_TYPE=SSTL135_I SLEWRATE=FAST;
+IOBUF PORT "ddram_a[15]" IO_TYPE=SSTL135_I SLEWRATE=FAST;
+
+LOCATE COMP "ddram_ba[0]" SITE "D6";
+LOCATE COMP "ddram_ba[1]" SITE "B7";
+LOCATE COMP "ddram_ba[2]" SITE "A6";
+LOCATE COMP "ddram_cas_n" SITE "D13";
+LOCATE COMP "ddram_cs_n"  SITE "A12";
+LOCATE COMP "ddram_dm[0]" SITE "D16";
+LOCATE COMP "ddram_dm[1]" SITE "G16";
+LOCATE COMP "ddram_ras_n" SITE "C12";
+LOCATE COMP "ddram_we_n"  SITE "B12";
+IOBUF PORT "ddram_ba[0]" IO_TYPE=SSTL135_I SLEWRATE=FAST;
+IOBUF PORT "ddram_ba[1]" IO_TYPE=SSTL135_I SLEWRATE=FAST;
+IOBUF PORT "ddram_ba[2]" IO_TYPE=SSTL135_I SLEWRATE=FAST;
+IOBUF PORT "ddram_cas_n" IO_TYPE=SSTL135_I SLEWRATE=FAST;
+IOBUF PORT "ddram_cs_n"  IO_TYPE=SSTL135_I SLEWRATE=FAST;
+IOBUF PORT "ddram_dm[0]" IO_TYPE=SSTL135_I SLEWRATE=FAST;
+IOBUF PORT "ddram_dm[1]" IO_TYPE=SSTL135_I SLEWRATE=FAST;
+IOBUF PORT "ddram_ras_n" IO_TYPE=SSTL135_I SLEWRATE=FAST;
+IOBUF PORT "ddram_we_n"  IO_TYPE=SSTL135_I SLEWRATE=FAST;
+
+// from litex platform, termination disabled to reduce heat
+LOCATE COMP "ddram_dq[0]"  SITE "C17";
+LOCATE COMP "ddram_dq[1]"  SITE "D15";
+LOCATE COMP "ddram_dq[2]"  SITE "B17";
+LOCATE COMP "ddram_dq[3]"  SITE "C16";
+LOCATE COMP "ddram_dq[4]"  SITE "A15";
+LOCATE COMP "ddram_dq[5]"  SITE "B13";
+LOCATE COMP "ddram_dq[6]"  SITE "A17";
+LOCATE COMP "ddram_dq[7]"  SITE "A13";
+LOCATE COMP "ddram_dq[8]"  SITE "F17";
+LOCATE COMP "ddram_dq[9]"  SITE "F16";
+LOCATE COMP "ddram_dq[10]" SITE "G15";
+LOCATE COMP "ddram_dq[11]" SITE "F15";
+LOCATE COMP "ddram_dq[12]" SITE "J16";
+LOCATE COMP "ddram_dq[13]" SITE "C18";
+LOCATE COMP "ddram_dq[14]" SITE "H16";
+LOCATE COMP "ddram_dq[15]" SITE "F18";
+IOBUF PORT "ddram_dq[0]"  IO_TYPE=SSTL135_I SLEWRATE=FAST TERMINATION=OFF;
+IOBUF PORT "ddram_dq[1]"  IO_TYPE=SSTL135_I SLEWRATE=FAST TERMINATION=OFF;
+IOBUF PORT "ddram_dq[2]"  IO_TYPE=SSTL135_I SLEWRATE=FAST TERMINATION=OFF;
+IOBUF PORT "ddram_dq[3]"  IO_TYPE=SSTL135_I SLEWRATE=FAST TERMINATION=OFF;
+IOBUF PORT "ddram_dq[4]"  IO_TYPE=SSTL135_I SLEWRATE=FAST TERMINATION=OFF;
+IOBUF PORT "ddram_dq[5]"  IO_TYPE=SSTL135_I SLEWRATE=FAST TERMINATION=OFF;
+IOBUF PORT "ddram_dq[6]"  IO_TYPE=SSTL135_I SLEWRATE=FAST TERMINATION=OFF;
+IOBUF PORT "ddram_dq[7]"  IO_TYPE=SSTL135_I SLEWRATE=FAST TERMINATION=OFF;
+IOBUF PORT "ddram_dq[8]"  IO_TYPE=SSTL135_I SLEWRATE=FAST TERMINATION=OFF;
+IOBUF PORT "ddram_dq[9]"  IO_TYPE=SSTL135_I SLEWRATE=FAST TERMINATION=OFF;
+IOBUF PORT "ddram_dq[10]" IO_TYPE=SSTL135_I SLEWRATE=FAST TERMINATION=OFF;
+IOBUF PORT "ddram_dq[11]" IO_TYPE=SSTL135_I SLEWRATE=FAST TERMINATION=OFF;
+IOBUF PORT "ddram_dq[12]" IO_TYPE=SSTL135_I SLEWRATE=FAST TERMINATION=OFF;
+IOBUF PORT "ddram_dq[13]" IO_TYPE=SSTL135_I SLEWRATE=FAST TERMINATION=OFF;
+IOBUF PORT "ddram_dq[14]" IO_TYPE=SSTL135_I SLEWRATE=FAST TERMINATION=OFF;
+IOBUF PORT "ddram_dq[15]" IO_TYPE=SSTL135_I SLEWRATE=FAST TERMINATION=OFF;
+
+LOCATE COMP "ddram_dqs_n[0]" SITE "A16";
+LOCATE COMP "ddram_dqs_n[1]" SITE "H17";
+LOCATE COMP "ddram_dqs_p[0]" SITE "B15";
+LOCATE COMP "ddram_dqs_p[1]" SITE "G18";
+IOBUF PORT "ddram_dqs_n[0]" IO_TYPE=SSTL135D_I SLEWRATE=FAST DIFFRESISTOR=100 TERMINATION=OFF;
+IOBUF PORT "ddram_dqs_n[1]" IO_TYPE=SSTL135D_I SLEWRATE=FAST DIFFRESISTOR=100 TERMINATION=OFF;
+IOBUF PORT "ddram_dqs_p[0]" IO_TYPE=SSTL135D_I SLEWRATE=FAST DIFFRESISTOR=100 TERMINATION=OFF;
+IOBUF PORT "ddram_dqs_p[1]" IO_TYPE=SSTL135D_I SLEWRATE=FAST DIFFRESISTOR=100 TERMINATION=OFF;
+
+LOCATE COMP "ddram_clk_p" SITE "J18";
+LOCATE COMP "ddram_clk_n" SITE "K18";
+IOBUF PORT "ddram_clk_p" IO_TYPE=SSTL135D_I SLEWRATE=FAST;
+IOBUF PORT "ddram_clk_n" IO_TYPE=SSTL135D_I SLEWRATE=FAST;
+
+LOCATE COMP "ddram_cke"     SITE "D18";
+LOCATE COMP "ddram_odt"     SITE "C13";
+LOCATE COMP "ddram_reset_n" SITE "L18";
+IOBUF PORT "ddram_cke"     IO_TYPE=SSTL135_I SLEWRATE=FAST;
+IOBUF PORT "ddram_odt"     IO_TYPE=SSTL135_I SLEWRATE=FAST;
+IOBUF PORT "ddram_reset_n" IO_TYPE=SSTL135_I SLEWRATE=FAST;
+
+LOCATE COMP "ddram_vccio[0]" SITE "K16";
+LOCATE COMP "ddram_vccio[1]" SITE "D17";
+LOCATE COMP "ddram_vccio[2]" SITE "K15";
+LOCATE COMP "ddram_vccio[3]" SITE "K17";
+LOCATE COMP "ddram_vccio[4]" SITE "B18";
+LOCATE COMP "ddram_vccio[5]" SITE "C6";
+LOCATE COMP "ddram_gnd[0]"   SITE "L15";
+LOCATE COMP "ddram_gnd[1]"   SITE "L16";
+IOBUF PORT "ddram_vccio[0]" IO_TYPE=SSTL135_II SLEWRATE=FAST;
+IOBUF PORT "ddram_vccio[1]" IO_TYPE=SSTL135_II SLEWRATE=FAST;
+IOBUF PORT "ddram_vccio[2]" IO_TYPE=SSTL135_II SLEWRATE=FAST;
+IOBUF PORT "ddram_vccio[3]" IO_TYPE=SSTL135_II SLEWRATE=FAST;
+IOBUF PORT "ddram_vccio[4]" IO_TYPE=SSTL135_II SLEWRATE=FAST;
+IOBUF PORT "ddram_vccio[5]" IO_TYPE=SSTL135_II SLEWRATE=FAST;
+IOBUF PORT "ddram_gnd[0]"   IO_TYPE=SSTL135_II SLEWRATE=FAST;
+IOBUF PORT "ddram_gnd[1]"   IO_TYPE=SSTL135_II SLEWRATE=FAST;
+
+// We use USRMCLK instead for clk
+// LOCATE COMP "spi_flash_clk" SITE "U16";
+// IOBUF PORT "spi_flash_clk" IO_TYPE=LVCMOS33;
+LOCATE COMP "spi_flash_cs_n" SITE "U17";
+IOBUF PORT "spi_flash_cs_n" IO_TYPE=LVCMOS33;
+LOCATE COMP "spi_flash_mosi" SITE "U18";
+IOBUF PORT "spi_flash_mosi" IO_TYPE=LVCMOS33;
+LOCATE COMP "spi_flash_miso" SITE "T18";
+IOBUF PORT "spi_flash_miso" IO_TYPE=LVCMOS33;
+LOCATE COMP "spi_flash_wp_n" SITE "R18";
+IOBUF PORT "spi_flash_wp_n" IO_TYPE=LVCMOS33;
+LOCATE COMP "spi_flash_hold_n" SITE "N18";
+IOBUF PORT "spi_flash_hold_n" IO_TYPE=LVCMOS33;
+
+LOCATE COMP "sdcard_data[0]" SITE "J1";
+LOCATE COMP "sdcard_data[1]" SITE "K3";
+LOCATE COMP "sdcard_data[2]" SITE "L3";
+LOCATE COMP "sdcard_data[3]" SITE "M1";
+LOCATE COMP "sdcard_cmd" SITE "K2";
+LOCATE COMP "sdcard_clk" SITE "K1";
+LOCATE COMP "sdcard_cd" SITE "L1";
+
+IOBUF PORT "sdcard_data[0]" IO_TYPE=LVCMOS33 SLEWRATE=FAST PULLMODE=UP;
+IOBUF PORT "sdcard_data[1]" IO_TYPE=LVCMOS33 SLEWRATE=FAST PULLMODE=UP;
+IOBUF PORT "sdcard_data[2]" IO_TYPE=LVCMOS33 SLEWRATE=FAST PULLMODE=UP;
+IOBUF PORT "sdcard_data[3]" IO_TYPE=LVCMOS33 SLEWRATE=FAST PULLMODE=UP;
+IOBUF PORT "sdcard_cmd" IO_TYPE=LVCMOS33 SLEWRATE=FAST PULLMODE=UP;
+IOBUF PORT "sdcard_clk" IO_TYPE=LVCMOS33 SLEWRATE=FAST;
+IOBUF PORT "sdcard_cd" IO_TYPE=LVCMOS33;
--- a/control.vhdl
+++ b/control.vhdl
@ -6,26 +6,24 @@ use work.common.all;

 entity control is
    generic (
-        PIPELINE_DEPTH : natural := 2
+        EX1_BYPASS : boolean := true;
+        PIPELINE_DEPTH : natural := 3
        );
    port (
        clk                 : in std_ulogic;
        rst                 : in std_ulogic;

-        complete_in         : in std_ulogic;
+        complete_in         : in instr_tag_t;
        valid_in            : in std_ulogic;
+        repeated            : in std_ulogic;
        flush_in            : in std_ulogic;
-	busy_in             : in std_ulogic;
+        busy_in             : in std_ulogic;
        deferred            : in std_ulogic;
        sgl_pipe_in         : in std_ulogic;
        stop_mark_in        : in std_ulogic;

        gpr_write_valid_in  : in std_ulogic;
        gpr_write_in        : in gspr_index_t;
-        gpr_bypassable      : in std_ulogic;
-
-        update_gpr_write_valid : in std_ulogic;
-        update_gpr_write_reg : in gspr_index_t;

        gpr_a_read_valid_in : in std_ulogic;
        gpr_a_read_in       : in gspr_index_t;
@ -36,9 +34,11 @@ entity control is
        gpr_c_read_valid_in : in std_ulogic;
        gpr_c_read_in       : in gspr_index_t;

+        execute_next_tag    : in instr_tag_t;
+        execute_next_cr_tag : in instr_tag_t;
+
        cr_read_in          : in std_ulogic;
        cr_write_in         : in std_ulogic;
-        cr_bypassable       : in std_ulogic;

        valid_out           : out std_ulogic;
        stall_out           : out std_ulogic;
@ -47,7 +47,9 @@ entity control is
        gpr_bypass_a        : out std_ulogic;
        gpr_bypass_b        : out std_ulogic;
        gpr_bypass_c        : out std_ulogic;
-        cr_bypass           : out std_ulogic
+        cr_bypass           : out std_ulogic;
+
+        instr_tag_out       : out instr_tag_t
        );
 end entity control;

@ -62,119 +64,165 @@ architecture rtl of control is

    signal r_int, rin_int : reg_internal_type := reg_internal_init;

-    signal stall_a_out  : std_ulogic;
-    signal stall_b_out  : std_ulogic;
-    signal stall_c_out  : std_ulogic;
-    signal cr_stall_out : std_ulogic;
+    signal gpr_write_valid : std_ulogic;
+    signal cr_write_valid  : std_ulogic;

-    signal gpr_write_valid : std_ulogic := '0';
-    signal cr_write_valid  : std_ulogic := '0';
+    type tag_register is record
+        wr_gpr : std_ulogic;
+        reg    : gspr_index_t;
+        recent : std_ulogic;
+        wr_cr  : std_ulogic;
+    end record;

-begin
-    gpr_hazard0: entity work.gpr_hazard
-        generic map (
-            PIPELINE_DEPTH => PIPELINE_DEPTH
-            )
-        port map (
-            clk                => clk,
-            busy_in            => busy_in,
-	    deferred           => deferred,
-            complete_in        => complete_in,
-            flush_in           => flush_in,
-            issuing            => valid_out,
-
-            gpr_write_valid_in => gpr_write_valid,
-            gpr_write_in       => gpr_write_in,
-            bypass_avail       => gpr_bypassable,
-            gpr_read_valid_in  => gpr_a_read_valid_in,
-            gpr_read_in        => gpr_a_read_in,
-
-            ugpr_write_valid   => update_gpr_write_valid,
-            ugpr_write_reg     => update_gpr_write_reg,
-
-            stall_out          => stall_a_out,
-            use_bypass         => gpr_bypass_a
-            );
-
-    gpr_hazard1: entity work.gpr_hazard
-        generic map (
-            PIPELINE_DEPTH => PIPELINE_DEPTH
-            )
-        port map (
-            clk                => clk,
-            busy_in            => busy_in,
-	    deferred           => deferred,
-            complete_in        => complete_in,
-            flush_in           => flush_in,
-            issuing            => valid_out,
-
-            gpr_write_valid_in => gpr_write_valid,
-            gpr_write_in       => gpr_write_in,
-            bypass_avail       => gpr_bypassable,
-            gpr_read_valid_in  => gpr_b_read_valid_in,
-            gpr_read_in        => gpr_b_read_in,
-
-            ugpr_write_valid   => update_gpr_write_valid,
-            ugpr_write_reg     => update_gpr_write_reg,
-
-            stall_out          => stall_b_out,
-            use_bypass         => gpr_bypass_b
-            );
-
-    gpr_hazard2: entity work.gpr_hazard
-        generic map (
-            PIPELINE_DEPTH => PIPELINE_DEPTH
-            )
-        port map (
-            clk                => clk,
-            busy_in            => busy_in,
-	    deferred           => deferred,
-            complete_in        => complete_in,
-            flush_in           => flush_in,
-            issuing            => valid_out,
-
-            gpr_write_valid_in => gpr_write_valid,
-            gpr_write_in       => gpr_write_in,
-            bypass_avail       => gpr_bypassable,
-            gpr_read_valid_in  => gpr_c_read_valid_in,
-            gpr_read_in        => gpr_c_read_in,
-
-            ugpr_write_valid   => update_gpr_write_valid,
-            ugpr_write_reg     => update_gpr_write_reg,
-
-            stall_out          => stall_c_out,
-            use_bypass         => gpr_bypass_c
-            );
-
-    cr_hazard0: entity work.cr_hazard
-        generic map (
-            PIPELINE_DEPTH => PIPELINE_DEPTH
-            )
-        port map (
-            clk                => clk,
-            busy_in            => busy_in,
-	    deferred           => deferred,
-            complete_in        => complete_in,
-            flush_in           => flush_in,
-            issuing            => valid_out,
-
-            cr_read_in         => cr_read_in,
-            cr_write_in        => cr_write_valid,
-            bypassable         => cr_bypassable,
-
-            stall_out          => cr_stall_out,
-            use_bypass         => cr_bypass
-            );
+    type tag_regs_array is array(tag_number_t) of tag_register;
+    signal tag_regs : tag_regs_array;
+
+    signal instr_tag  : instr_tag_t;

+    signal gpr_tag_stall : std_ulogic;
+    signal cr_tag_stall  : std_ulogic;
+
+    signal curr_tag : tag_number_t;
+    signal next_tag : tag_number_t;
+
+    signal curr_cr_tag : tag_number_t;
+
+begin
    control0: process(clk)
    begin
        if rising_edge(clk) then
            assert rin_int.outstanding >= 0 and rin_int.outstanding <= (PIPELINE_DEPTH+1)
                report "Outstanding bad " & integer'image(rin_int.outstanding) severity failure;
            r_int <= rin_int;
+            for i in tag_number_t loop  -- update every tag register on each rising clock edge
+                if rst = '1' or flush_in = '1' then  -- reset or flush: no tag has a pending GPR or CR write
+                    tag_regs(i).wr_gpr <= '0';
+                    tag_regs(i).wr_cr <= '0';
+                else
+                    if complete_in.valid = '1' and i = complete_in.tag then  -- the instruction holding tag i is completing
+                        tag_regs(i).wr_gpr <= '0';
+                        tag_regs(i).wr_cr <= '0';
+                        report "tag " & integer'image(i) & " not valid";
+                    end if;
+                    if gpr_write_valid = '1' and tag_regs(i).reg = gpr_write_in then  -- a new write to the same register supersedes tag i
+                        tag_regs(i).recent <= '0';
+                        if tag_regs(i).recent = '1' and tag_regs(i).wr_gpr = '1' then
+                            report "tag " & integer'image(i) & " not recent";
+                        end if;
+                    end if;
+                    if instr_tag.valid = '1' and i = instr_tag.tag then  -- tag i is allocated this cycle; these assignments override the ones above
+                        tag_regs(i).wr_gpr <= gpr_write_valid;
+                        tag_regs(i).reg <= gpr_write_in;
+                        tag_regs(i).recent <= gpr_write_valid;
+                        tag_regs(i).wr_cr <= cr_write_valid;
+                        if gpr_write_valid = '1' then
+                            report "tag " & integer'image(i) & " valid for gpr " & to_hstring(gpr_write_in);
+                        end if;
+                    end if;
+                end if;
+            end loop;
+            if rst = '1' then  -- reset restarts tag numbering at 0
+                curr_tag <= 0;
+                curr_cr_tag <= 0;
+            else
+                curr_tag <= next_tag;  -- next_tag is computed combinationally in control_hazards
+                if cr_write_valid = '1' then  -- remember the tag of the latest CR-writing instruction
+                    curr_cr_tag <= instr_tag.tag;
+                end if;
+            end if;
        end if;
    end process;

+    control_hazards : process(all)  -- combinational: GPR/CR hazard detection, bypass selection and tag allocation
+        variable gpr_stall : std_ulogic;
+        variable tag_a : instr_tag_t;
+        variable tag_b : instr_tag_t;
+        variable tag_c : instr_tag_t;
+        variable tag_s : instr_tag_t;
+        variable tag_t : instr_tag_t;
+        variable incr_tag : tag_number_t;
+        variable byp_a : std_ulogic;
+        variable byp_b : std_ulogic;
+        variable byp_c : std_ulogic;
+        variable tag_cr : instr_tag_t;
+        variable byp_cr : std_ulogic;
+    begin
+        tag_a := instr_tag_init;  -- operand A: look for a pending, most-recent writer of gpr_a_read_in
+        for i in tag_number_t loop
+            if tag_regs(i).wr_gpr = '1' and tag_regs(i).recent = '1' and tag_regs(i).reg = gpr_a_read_in then
+                tag_a.valid := gpr_a_read_valid_in;  -- only a hazard if operand A is actually read
+                tag_a.tag := i;
+            end if;
+        end loop;
+        if tag_match(tag_a, complete_in) then  -- the writer is the instruction flagged by complete_in: no hazard
+            tag_a.valid := '0';
+        end if;
+        tag_b := instr_tag_init;  -- operand B: same search against gpr_b_read_in
+        for i in tag_number_t loop
+            if tag_regs(i).wr_gpr = '1' and tag_regs(i).recent = '1' and tag_regs(i).reg = gpr_b_read_in then
+                tag_b.valid := gpr_b_read_valid_in;
+                tag_b.tag := i;
+            end if;
+        end loop;
+        if tag_match(tag_b, complete_in) then
+            tag_b.valid := '0';
+        end if;
+        tag_c := instr_tag_init;  -- operand C: same search against gpr_c_read_in
+        for i in tag_number_t loop
+            if tag_regs(i).wr_gpr = '1' and tag_regs(i).recent = '1' and tag_regs(i).reg = gpr_c_read_in then
+                tag_c.valid := gpr_c_read_valid_in;
+                tag_c.tag := i;
+            end if;
+        end loop;
+        if tag_match(tag_c, complete_in) then
+            tag_c.valid := '0';
+        end if;
+
+        byp_a := '0';
+        if EX1_BYPASS and tag_match(execute_next_tag, tag_a) then  -- bypass when the writer's tag equals execute_next_tag (and the generic allows it)
+            byp_a := '1';
+        end if;
+        byp_b := '0';
+        if EX1_BYPASS and tag_match(execute_next_tag, tag_b) then
+            byp_b := '1';
+        end if;
+        byp_c := '0';
+        if EX1_BYPASS and tag_match(execute_next_tag, tag_c) then
+            byp_c := '1';
+        end if;
+
+        gpr_bypass_a <= byp_a;
+        gpr_bypass_b <= byp_b;
+        gpr_bypass_c <= byp_c;
+
+        gpr_tag_stall <= (tag_a.valid and not byp_a) or  -- stall on any operand hazard that is not bypassed
+                         (tag_b.valid and not byp_b) or
+                         (tag_c.valid and not byp_c);
+
+        incr_tag := curr_tag;
+        instr_tag.tag <= curr_tag;
+        instr_tag.valid <= valid_out and not deferred;  -- the tag is valid only when valid_out is set and deferred is clear
+        if instr_tag.valid = '1' then  -- instr_tag is a signal: this reads its current value; process(all) re-runs once it changes
+            incr_tag := (curr_tag + 1) mod TAG_COUNT;  -- advance the tag number, wrapping at TAG_COUNT
+        end if;
+        next_tag <= incr_tag;
+        instr_tag_out <= instr_tag;
+
+        -- CR hazards
+        tag_cr.tag := curr_cr_tag;
+        tag_cr.valid := cr_read_in and tag_regs(curr_cr_tag).wr_cr;  -- hazard if CR is read while the latest CR writer's write is still pending
+        if tag_match(tag_cr, complete_in) then  -- the writer is the instruction flagged by complete_in: no hazard
+            tag_cr.valid := '0';
+        end if;
+        byp_cr := '0';
+        if EX1_BYPASS and tag_match(execute_next_cr_tag, tag_cr) then
+            byp_cr := '1';
+        end if;
+
+        cr_bypass <= byp_cr;
+        cr_tag_stall <= tag_cr.valid and not byp_cr;  -- stall on a CR hazard that is not bypassed
+    end process;
+
    control1 : process(all)
        variable v_int : reg_internal_type;
        variable valid_tmp : std_ulogic;
@ -187,13 +235,18 @@ begin
        stall_tmp := '0';

        if flush_in = '1' then
-            -- expect to see complete_in next cycle
-            v_int.outstanding := 1;
-        elsif complete_in = '1' then
+            v_int.outstanding := 0;
+        elsif complete_in.valid = '1' then
            v_int.outstanding := r_int.outstanding - 1;
        end if;
+        if r_int.outstanding >= PIPELINE_DEPTH + 1 then
+            valid_tmp := '0';
+            stall_tmp := '1';
+        end if;

        if rst = '1' then
+            gpr_write_valid <= '0';
+            cr_write_valid <= '0';
            v_int := reg_internal_init;
            valid_tmp := '0';
        end if;
@ -218,8 +271,8 @@ begin
                            v_int.state := WAIT_FOR_CURR_TO_COMPLETE;
                        end if;
                    else
-                        -- let it go out if there are no GPR hazards
-                        stall_tmp := stall_a_out or stall_b_out or stall_c_out or cr_stall_out;
+                        -- let it go out if there are no GPR or CR hazards
+                        stall_tmp := gpr_tag_stall or cr_tag_stall;
                    end if;
                end if;

@ -245,8 +298,8 @@ begin
                                v_int.state := WAIT_FOR_CURR_TO_COMPLETE;
                            end if;
                        else
-                            -- let it go out if there are no GPR hazards
-                            stall_tmp := stall_a_out or stall_b_out or stall_c_out or cr_stall_out;
+                            -- let it go out if there are no GPR or CR hazards
+                            stall_tmp := gpr_tag_stall or cr_tag_stall;
                        end if;
                    end if;
                else
@ -258,15 +311,11 @@ begin
            valid_tmp := '0';
        end if;

-        if valid_tmp = '1' then
-            if deferred = '0' then
-                v_int.outstanding := v_int.outstanding + 1;
-            end if;
-            gpr_write_valid <= gpr_write_valid_in;
-            cr_write_valid <= cr_write_in;
-        else
-            gpr_write_valid <= '0';
-            cr_write_valid <= '0';
+        gpr_write_valid <= gpr_write_valid_in and valid_tmp;
+        cr_write_valid <= cr_write_in and valid_tmp;
+
+        if valid_tmp = '1' and deferred = '0' then
+            v_int.outstanding := v_int.outstanding + 1;
        end if;

        -- update outputs
--- a/core.vhdl
+++ b/core.vhdl
@ -12,8 +12,17 @@ entity core is
 	DISABLE_FLATTEN : boolean := false;
        EX1_BYPASS : boolean := true;
        HAS_FPU : boolean := true;
+        HAS_BTC : boolean := true;
+        HAS_SHORT_MULT : boolean := false;
 	ALT_RESET_ADDRESS : std_ulogic_vector(63 downto 0) := (others => '0');
-        LOG_LENGTH : natural := 512
+        LOG_LENGTH : natural := 512;
+        ICACHE_NUM_LINES : natural := 64;
+        ICACHE_NUM_WAYS : natural := 2;
+        ICACHE_TLB_SIZE : natural := 64;
+        DCACHE_NUM_LINES : natural := 64;
+        DCACHE_NUM_WAYS : natural := 2;
+        DCACHE_TLB_SET_SIZE : natural := 64;
+        DCACHE_TLB_NUM_WAYS : natural := 2
        );
    port (
        clk          : in std_ulogic;
@ -29,6 +38,8 @@ entity core is
        wishbone_data_in  : in wishbone_slave_out;
        wishbone_data_out : out wishbone_master_out;

+        wb_snoop_in     : in wishbone_master_out;
+
 	dmi_addr	: in std_ulogic_vector(3 downto 0);
 	dmi_din	        : in std_ulogic_vector(63 downto 0);
 	dmi_dout	: out std_ulogic_vector(63 downto 0);
@ -45,6 +56,7 @@ end core;
 architecture behave of core is
    -- icache signals
    signal fetch1_to_icache : Fetch1ToIcacheType;
+    signal writeback_to_fetch1: WritebackToFetch1Type;
    signal icache_to_decode1 : IcacheToDecode1Type;
    signal mmu_to_icache : MmuToIcacheType;

@ -65,7 +77,8 @@ architecture behave of core is

    -- execute signals
    signal execute1_to_writeback: Execute1ToWritebackType;
-    signal execute1_to_fetch1: Execute1ToFetch1Type;
+    signal execute1_bypass: bypass_data_t;
+    signal execute1_cr_bypass: cr_bypass_data_t;

    -- load store signals
    signal execute1_to_loadstore1: Execute1ToLoadstore1Type;
@ -101,22 +114,23 @@ architecture behave of core is
    signal decode1_flush: std_ulogic;
    signal fetch1_flush: std_ulogic;

-    signal complete: std_ulogic;
+    signal complete: instr_tag_t;
    signal terminate: std_ulogic;
    signal core_rst: std_ulogic;
-    signal icache_inv: std_ulogic;
+    signal do_interrupt: std_ulogic;

    -- Delayed/Latched resets and alt_reset
-    signal rst_fetch1  : std_ulogic := '1';
-    signal rst_fetch2  : std_ulogic := '1';
-    signal rst_icache  : std_ulogic := '1';
-    signal rst_dcache  : std_ulogic := '1';
-    signal rst_dec1    : std_ulogic := '1';
-    signal rst_dec2    : std_ulogic := '1';
-    signal rst_ex1     : std_ulogic := '1';
-    signal rst_fpu     : std_ulogic := '1';
-    signal rst_ls1     : std_ulogic := '1';
-    signal rst_dbg     : std_ulogic := '1';
+    signal rst_fetch1  : std_ulogic;
+    signal rst_fetch2  : std_ulogic;
+    signal rst_icache  : std_ulogic;
+    signal rst_dcache  : std_ulogic;
+    signal rst_dec1    : std_ulogic;
+    signal rst_dec2    : std_ulogic;
+    signal rst_ex1     : std_ulogic;
+    signal rst_fpu     : std_ulogic;
+    signal rst_ls1     : std_ulogic;
+    signal rst_wback   : std_ulogic;
+    signal rst_dbg     : std_ulogic;
    signal alt_reset_d : std_ulogic;

    signal sim_cr_dump: std_ulogic;
@ -133,6 +147,12 @@ architecture behave of core is

    signal msr : std_ulogic_vector(63 downto 0);

+    -- PMU event bus
+    signal icache_events    : IcacheEventType;
+    signal loadstore_events : Loadstore1EventType;
+    signal dcache_events    : DcacheEventType;
+    signal writeback_events : WritebackEventType;
+
    -- Debug status
    signal dbg_core_is_stopped: std_ulogic;

@ -179,6 +199,7 @@ begin
            rst_ex1     <= core_rst;
            rst_fpu     <= core_rst;
            rst_ls1     <= core_rst;
+            rst_wback   <= core_rst;
            rst_dbg     <= rst;
            alt_reset_d <= alt_reset;
        end if;
@ -187,7 +208,8 @@ begin
    fetch1_0: entity work.fetch1
        generic map (
            RESET_ADDRESS => (others => '0'),
-	    ALT_RESET_ADDRESS => ALT_RESET_ADDRESS
+	    ALT_RESET_ADDRESS => ALT_RESET_ADDRESS,
+            HAS_BTC => HAS_BTC
            )
        port map (
            clk => clk,
@ -195,9 +217,10 @@ begin
 	    alt_reset_in => alt_reset_d,
            stall_in => fetch1_stall_in,
            flush_in => fetch1_flush,
+            inval_btc => ex1_icache_inval or mmu_to_icache.tlbie,
 	    stop_in => dbg_core_stop,
            d_in => decode1_to_fetch1,
-            e_in => execute1_to_fetch1,
+            w_in => writeback_to_fetch1,
            i_out => fetch1_to_icache,
            log_out => log_data(42 downto 0)
            );
@ -209,8 +232,9 @@ begin
        generic map(
            SIM => SIM,
            LINE_SIZE => 64,
-            NUM_LINES => 64,
-	    NUM_WAYS => 2,
+            NUM_LINES => ICACHE_NUM_LINES,
+            NUM_WAYS => ICACHE_NUM_WAYS,
+            TLB_SIZE => ICACHE_TLB_SIZE,
            LOG_LENGTH => LOG_LENGTH
            )
        port map(
@ -225,6 +249,8 @@ begin
 	    stall_out => icache_stall_out,
            wishbone_out => wishbone_insn_out,
            wishbone_in => wishbone_insn_in,
+            wb_snoop_in => wb_snoop_in,
+            events => icache_events,
            log_out => log_data(96 downto 43)
            );

@ -270,6 +296,8 @@ begin
            r_out => decode2_to_register_file,
            c_in => cr_file_to_decode2,
            c_out => decode2_to_cr_file,
+            execute_bypass => execute1_bypass,
+            execute_cr_bypass => execute1_cr_bypass,
            log_out => log_data(119 downto 110)
            );
    decode2_busy_in <= ex1_busy_out;
@ -312,23 +340,30 @@ begin
        generic map (
            EX1_BYPASS => EX1_BYPASS,
            HAS_FPU => HAS_FPU,
+            HAS_SHORT_MULT => HAS_SHORT_MULT,
            LOG_LENGTH => LOG_LENGTH
            )
        port map (
            clk => clk,
            rst => rst_ex1,
-            flush_out => flush,
+            flush_in => flush,
 	    busy_out => ex1_busy_out,
            e_in => decode2_to_execute1,
            l_in => loadstore1_to_execute1,
            fp_in => fpu_to_execute1,
            ext_irq_in => ext_irq,
+            interrupt_in => do_interrupt,
            l_out => execute1_to_loadstore1,
-            f_out => execute1_to_fetch1,
            fp_out => execute1_to_fpu,
            e_out => execute1_to_writeback,
+            bypass_data => execute1_bypass,
+            bypass_cr_data => execute1_cr_bypass,
 	    icache_inval => ex1_icache_inval,
            dbg_msr_out => msr,
+            wb_events => writeback_events,
+            ls_events => loadstore_events,
+            dc_events => dcache_events,
+            ic_events => icache_events,
            terminate_out => terminate,
            log_out => log_data(134 downto 120),
            log_rd_addr => log_rd_addr,
@ -350,13 +385,8 @@ begin

    no_fpu: if not HAS_FPU generate
    begin
-        fpu_to_execute1.busy <= '0';
-        fpu_to_execute1.exception <= '0';
-        fpu_to_execute1.interrupt <= '0';
-        fpu_to_execute1.illegal <= '0';
-        fpu_to_writeback.valid <= '0';
-        fpu_to_writeback.write_enable <= '0';
-        fpu_to_writeback.write_cr_enable <= '0';
+        fpu_to_execute1 <= FPUToExecute1Init;  -- HAS_FPU false: hold the whole FPU->execute1 record at its Init constant
+        fpu_to_writeback <= FPUToWritebackInit;  -- likewise for the FPU->writeback record, so no field is left undriven
    end generate;

    loadstore1_0: entity work.loadstore1
@ -375,6 +405,7 @@ begin
            m_out => loadstore1_to_mmu,
            m_in => mmu_to_loadstore1,
            dc_stall => dcache_stall_out,
+            events => loadstore_events,
            log_out => log_data(149 downto 140)
            );

@ -392,8 +423,10 @@ begin
    dcache_0: entity work.dcache
        generic map(
            LINE_SIZE => 64,
-            NUM_LINES => 64,
-	    NUM_WAYS => 2,
+            NUM_LINES => DCACHE_NUM_LINES,
+            NUM_WAYS => DCACHE_NUM_WAYS,
+            TLB_SET_SIZE => DCACHE_TLB_SET_SIZE,
+            TLB_NUM_WAYS => DCACHE_TLB_NUM_WAYS,
            LOG_LENGTH => LOG_LENGTH
            )
        port map (
@ -406,17 +439,24 @@ begin
            stall_out => dcache_stall_out,
            wishbone_in => wishbone_data_in,
            wishbone_out => wishbone_data_out,
+            snoop_in => wb_snoop_in,
+            events => dcache_events,
            log_out => log_data(170 downto 151)
            );

    writeback_0: entity work.writeback
        port map (
            clk => clk,
+            rst => rst_wback,  -- copy of core_rst latched in the delayed-reset process
+            flush_out => flush,  -- writeback drives the flush net; execute1_0 receives it on flush_in
            e_in => execute1_to_writeback,
            l_in => loadstore1_to_writeback,
            fp_in => fpu_to_writeback,
            w_out => writeback_to_register_file,
            c_out => writeback_to_cr_file,
+            f_out => writeback_to_fetch1,  -- connected to fetch1_0's w_in port
+            events => writeback_events,  -- connected to execute1_0's wb_events port
+            interrupt_out => do_interrupt,  -- connected to execute1_0's interrupt_in port
            complete_out => complete
            );

--- a/core_debug.vhdl
+++ b/core_debug.vhdl
@ -12,25 +12,25 @@ entity core_debug is
        LOG_LENGTH : natural := 512
        );
    port (
-        clk          : in std_logic;
-        rst          : in std_logic;
-
-	dmi_addr	: in std_ulogic_vector(3 downto 0);
-	dmi_din	        : in std_ulogic_vector(63 downto 0);
-	dmi_dout	: out std_ulogic_vector(63 downto 0);
-	dmi_req	        : in std_ulogic;
-	dmi_wr		: in std_ulogic;
-	dmi_ack	        : out std_ulogic;
-
-	-- Debug actions
-	core_stop       : out std_ulogic;
-	core_rst        : out std_ulogic;
-	icache_rst      : out std_ulogic;
-
-	-- Core status inputs
-	terminate       : in std_ulogic;
-	core_stopped    : in std_ulogic;
-	nia             : in std_ulogic_vector(63 downto 0);
+        clk             : in std_logic;
+        rst             : in std_logic;
+
+        dmi_addr        : in std_ulogic_vector(3 downto 0);
+        dmi_din         : in std_ulogic_vector(63 downto 0);
+        dmi_dout        : out std_ulogic_vector(63 downto 0);
+        dmi_req         : in std_ulogic;
+        dmi_wr          : in std_ulogic;
+        dmi_ack         : out std_ulogic;
+
+        -- Debug actions
+        core_stop       : out std_ulogic;
+        core_rst        : out std_ulogic;
+        icache_rst      : out std_ulogic;
+
+        -- Core status inputs
+        terminate       : in std_ulogic;
+        core_stopped    : in std_ulogic;
+        nia             : in std_ulogic_vector(63 downto 0);
        msr             : in std_ulogic_vector(63 downto 0);

        -- GSPR register read port
@ -45,8 +45,8 @@ entity core_debug is
        log_read_data   : out std_ulogic_vector(63 downto 0);
        log_write_addr  : out std_ulogic_vector(31 downto 0);

-	-- Misc
-	terminated_out  : out std_ulogic
+        -- Misc
+        terminated_out  : out std_ulogic
        );
 end core_debug;

@ -60,7 +60,7 @@ architecture behave of core_debug is
    -- bit     2 : Icache reset
    -- bit     3 : Single step
    -- bit     4 : Core start
-    constant DBG_CORE_CTRL	   : std_ulogic_vector(3 downto 0) := "0000";
+    constant DBG_CORE_CTRL         : std_ulogic_vector(3 downto 0) := "0000";
    constant DBG_CORE_CTRL_STOP    : integer := 0;
    constant DBG_CORE_CTRL_RESET   : integer := 1;
    constant DBG_CORE_CTRL_ICRESET : integer := 2;
@ -71,13 +71,13 @@ architecture behave of core_debug is
    -- bit    0 : Core stopping (wait til bit 1 set)
    -- bit    1 : Core stopped
    -- bit    2 : Core terminated (clears with start or reset)
-    constant DBG_CORE_STAT	   : std_ulogic_vector(3 downto 0) := "0001";
+    constant DBG_CORE_STAT           : std_ulogic_vector(3 downto 0) := "0001";
    constant DBG_CORE_STAT_STOPPING  : integer := 0;
    constant DBG_CORE_STAT_STOPPED   : integer := 1;
    constant DBG_CORE_STAT_TERM      : integer := 2;

    -- NIA register (read only for now)
-    constant DBG_CORE_NIA	     : std_ulogic_vector(3 downto 0) := "0010";
+    constant DBG_CORE_NIA             : std_ulogic_vector(3 downto 0) := "0010";

    -- MSR (read only)
    constant DBG_CORE_MSR            : std_ulogic_vector(3 downto 0) := "0011";
@ -91,6 +91,7 @@ architecture behave of core_debug is
    -- Log buffer address and data registers
    constant DBG_CORE_LOG_ADDR       : std_ulogic_vector(3 downto 0) := "0110";
    constant DBG_CORE_LOG_DATA       : std_ulogic_vector(3 downto 0) := "0111";
+    constant DBG_CORE_LOG_TRIGGER    : std_ulogic_vector(3 downto 0) := "1000";

    constant LOG_INDEX_BITS : natural := log2(LOG_LENGTH);

@ -106,11 +107,14 @@ architecture behave of core_debug is
    signal do_gspr_rd   : std_ulogic;
    signal gspr_index   : gspr_index_t;

-    signal log_dmi_addr : std_ulogic_vector(31 downto 0) := (others => '0');
-    signal log_dmi_data : std_ulogic_vector(63 downto 0) := (others => '0');
-    signal do_dmi_log_rd : std_ulogic;
-    signal dmi_read_log_data : std_ulogic;
+    signal log_dmi_addr        : std_ulogic_vector(31 downto 0) := (others => '0');
+    signal log_dmi_data        : std_ulogic_vector(63 downto 0) := (others => '0');
+    signal log_dmi_trigger     : std_ulogic_vector(63 downto 0) := (others => '0');
+    signal do_log_trigger      : std_ulogic := '0';
+    signal do_dmi_log_rd       : std_ulogic;
+    signal dmi_read_log_data   : std_ulogic;
    signal dmi_read_log_data_1 : std_ulogic;
+    signal log_trigger_delay   : integer range 0 to 255 := 0;

 begin
       -- Single cycle register accesses on DMI except for GSPR data
@ -121,76 +125,89 @@ begin

    -- Status register read composition
    stat_reg <= (2 => terminated,
-		 1 => core_stopped,
-		 0 => stopping,
-		 others => '0');
+                 1 => core_stopped,
+                 0 => stopping,
+                 others => '0');

    -- DMI read data mux
    with dmi_addr select dmi_dout <=
-	stat_reg        when DBG_CORE_STAT,
-	nia             when DBG_CORE_NIA,
+        stat_reg        when DBG_CORE_STAT,
+        nia             when DBG_CORE_NIA,
        msr             when DBG_CORE_MSR,
        dbg_gpr_data    when DBG_CORE_GSPR_DATA,
        log_write_addr & log_dmi_addr when DBG_CORE_LOG_ADDR,
        log_dmi_data    when DBG_CORE_LOG_DATA,
-	(others => '0') when others;
+        log_dmi_trigger when DBG_CORE_LOG_TRIGGER,
+        (others => '0') when others;

    -- DMI writes
    reg_write: process(clk)
    begin
-	if rising_edge(clk) then
-	    -- Reset the 1-cycle "do" signals
-	    do_step <= '0';
-	    do_reset <= '0';
-	    do_icreset <= '0';
+        if rising_edge(clk) then
+            -- Reset the 1-cycle "do" signals
+            do_step <= '0';
+            do_reset <= '0';
+            do_icreset <= '0';
            do_dmi_log_rd <= '0';

-	    if (rst) then
-		stopping <= '0';
-		terminated <= '0';
-	    else
-		-- Edge detect on dmi_req for 1-shot pulses
-		dmi_req_1 <= dmi_req;
-		if dmi_req = '1' and dmi_req_1 = '0' then
-		    if dmi_wr = '1' then
-			report("DMI write to " & to_hstring(dmi_addr));
-
-			-- Control register actions
-			if dmi_addr = DBG_CORE_CTRL then
-			    if dmi_din(DBG_CORE_CTRL_RESET) = '1' then
-				do_reset <= '1';
-				terminated <= '0';
-			    end if;
-			    if dmi_din(DBG_CORE_CTRL_STOP) = '1' then
-				stopping <= '1';
-			    end if;
-			    if dmi_din(DBG_CORE_CTRL_STEP) = '1' then
-				do_step <= '1';
-				terminated <= '0';
-			    end if;
-			    if dmi_din(DBG_CORE_CTRL_ICRESET) = '1' then
-				do_icreset <= '1';
-			    end if;
-			    if dmi_din(DBG_CORE_CTRL_START) = '1' then
-				stopping <= '0';
-				terminated <= '0';
-			    end if;
+            if (rst) then
+                stopping <= '0';
+                terminated <= '0';
+                log_trigger_delay <= 0;
+                gspr_index <= (others => '0');
+            else
+                if do_log_trigger = '1' or log_trigger_delay /= 0 then  -- start on a trigger pulse, keep going once started
+                    if log_trigger_delay = 255 then  -- top of the counter's 0..255 range reached
+                        log_dmi_trigger(1) <= '1';  -- bit 1 is ORed into the term that disables log_wr_enable
+                        log_trigger_delay <= 0;  -- back to idle
+                    else
+                        log_trigger_delay <= log_trigger_delay + 1;  -- still delaying the stop
+                    end if;
+                end if;
+                -- Edge detect on dmi_req for 1-shot pulses
+                dmi_req_1 <= dmi_req;
+                if dmi_req = '1' and dmi_req_1 = '0' then
+                    if dmi_wr = '1' then
+                        report("DMI write to " & to_hstring(dmi_addr));
+
+                        -- Control register actions
+                        if dmi_addr = DBG_CORE_CTRL then
+                            if dmi_din(DBG_CORE_CTRL_RESET) = '1' then
+                                do_reset <= '1';
+                                terminated <= '0';
+                            end if;
+                            if dmi_din(DBG_CORE_CTRL_STOP) = '1' then
+                                stopping <= '1';
+                            end if;
+                            if dmi_din(DBG_CORE_CTRL_STEP) = '1' then
+                                do_step <= '1';
+                                terminated <= '0';
+                            end if;
+                            if dmi_din(DBG_CORE_CTRL_ICRESET) = '1' then
+                                do_icreset <= '1';
+                            end if;
+                            if dmi_din(DBG_CORE_CTRL_START) = '1' then
+                                stopping <= '0';
+                                terminated <= '0';
+                            end if;
                        elsif dmi_addr = DBG_CORE_GSPR_INDEX then
                            gspr_index <= dmi_din(gspr_index_t'left downto 0);
                        elsif dmi_addr = DBG_CORE_LOG_ADDR then
                            log_dmi_addr <= dmi_din(31 downto 0);
                            do_dmi_log_rd <= '1';
-			end if;
-		    else
-			report("DMI read from " & to_string(dmi_addr));
-		    end if;
+                        elsif dmi_addr = DBG_CORE_LOG_TRIGGER then
+                            log_dmi_trigger <= dmi_din;
+                        end if;
+                    else
+                        report("DMI read from " & to_string(dmi_addr));
+                    end if;

                elsif dmi_read_log_data = '0' and dmi_read_log_data_1 = '1' then
                    -- Increment log_dmi_addr after the end of a read from DBG_CORE_LOG_DATA
                    log_dmi_addr(LOG_INDEX_BITS + 1 downto 0) <=
                        std_ulogic_vector(unsigned(log_dmi_addr(LOG_INDEX_BITS+1 downto 0)) + 1);
                    do_dmi_log_rd <= '1';
-		end if;
+                end if;
                dmi_read_log_data_1 <= dmi_read_log_data;
                if dmi_req = '1' and dmi_addr = DBG_CORE_LOG_DATA then
                    dmi_read_log_data <= '1';
@ -198,15 +215,15 @@ begin
                    dmi_read_log_data <= '0';
                end if;

-		-- Set core stop on terminate. We'll be stopping some time *after*
-		-- the offending instruction, at least until we can do back flushes
-		-- that preserve NIA which we can't just yet.
-		if terminate = '1' then
-		    stopping <= '1';
-		    terminated <= '1';
-		end if;
-	    end if;
-	end if;
+                -- Set core stop on terminate. We'll be stopping some time *after*
+                -- the offending instruction, at least until we can do back flushes
+                -- that preserve NIA which we can't just yet.
+                if terminate = '1' then
+                    stopping <= '1';
+                    terminated <= '1';
+                end if;
+            end if;
+        end if;
    end process;

    dbg_gpr_addr <= gspr_index;
@ -221,15 +238,15 @@ begin
    maybe_log: if LOG_LENGTH > 0 generate
        subtype log_ptr_t is unsigned(LOG_INDEX_BITS - 1 downto 0);
        type log_array_t is array(0 to LOG_LENGTH - 1) of std_ulogic_vector(255 downto 0);
-        signal log_array    : log_array_t;
-        signal log_rd_ptr   : log_ptr_t;
-        signal log_wr_ptr   : log_ptr_t;
-        signal log_toggle   : std_ulogic;
-        signal log_wr_enable : std_ulogic;
+        signal log_array          : log_array_t;
+        signal log_rd_ptr         : log_ptr_t;
+        signal log_wr_ptr         : log_ptr_t;
+        signal log_toggle         : std_ulogic;
+        signal log_wr_enable      : std_ulogic;
        signal log_rd_ptr_latched : log_ptr_t;
-        signal log_rd       : std_ulogic_vector(255 downto 0);
-        signal log_dmi_reading : std_ulogic;
-        signal log_dmi_read_done : std_ulogic;
+        signal log_rd             : std_ulogic_vector(255 downto 0);
+        signal log_dmi_reading    : std_ulogic;
+        signal log_dmi_read_done  : std_ulogic;

        function select_dword(data : std_ulogic_vector(255 downto 0);
                              addr : std_ulogic_vector(31 downto 0)) return std_ulogic_vector is
@ -246,7 +263,7 @@ begin

    begin
        -- Use MSB of read addresses to stop the logging
-        log_wr_enable <= not (log_read_addr(31) or log_dmi_addr(31));
+        log_wr_enable <= not (log_read_addr(31) or log_dmi_addr(31) or log_dmi_trigger(1));

        log_ram: process(clk)
        begin
@ -285,6 +302,12 @@ begin
                end if;
                log_dmi_read_done <= log_dmi_reading;
                log_dmi_reading <= do_dmi_log_rd;
+                do_log_trigger <= '0';  -- default: no trigger this cycle
+                if log_data(42) = log_dmi_trigger(63) and  -- log bit 42 must equal trigger register bit 63
+                    log_data(41 downto 0) = log_dmi_trigger(43 downto 2) and  -- log bits 41:0 must equal trigger bits 43:2
+                    log_dmi_trigger(0) = '1' then  -- bit 0 of the trigger register enables the comparison
+                    do_log_trigger <= '1';  -- read by reg_write to start its log_trigger_delay count
+                end if;
            end if;
        end process;
        log_write_addr(LOG_INDEX_BITS - 1 downto 0) <= std_ulogic_vector(log_wr_ptr);
--- a/core_dram_tb.vhdl
+++ b/core_dram_tb.vhdl
@ -9,7 +9,7 @@ use work.utils.all;

 entity core_dram_tb is
    generic (
-	MEMORY_SIZE    : natural := (384*1024);
+        MEMORY_SIZE    : natural := (384*1024);
        MAIN_RAM_FILE  : string  := "main_ram.bin";
        DRAM_INIT_FILE : string  := "";
        DRAM_INIT_SIZE : natural := 16#c000#
@ -57,25 +57,25 @@ architecture behave of core_dram_tb is
 begin

    soc0: entity work.soc
-	generic map(
-	    SIM => true,
-	    MEMORY_SIZE => MEMORY_SIZE,
-	    RAM_INIT_FILE => MAIN_RAM_FILE,
+        generic map(
+            SIM => true,
+            MEMORY_SIZE => MEMORY_SIZE,
+            RAM_INIT_FILE => MAIN_RAM_FILE,
            HAS_DRAM => true,
-	    DRAM_SIZE => 256 * 1024 * 1024,
+            DRAM_SIZE => 256 * 1024 * 1024,
            DRAM_INIT_SIZE => ROM_SIZE,
-	    CLK_FREQ => 100000000,
+            CLK_FREQ => 100000000,
            HAS_SPI_FLASH    => true,
            SPI_FLASH_DLINES => 4,
            SPI_FLASH_OFFSET => 0
-	    )
-	port map(
-	    rst => soc_rst,
-	    system_clk => system_clk,
-	    wb_dram_in => wb_dram_in,
-	    wb_dram_out => wb_dram_out,
-	    wb_ext_io_in => wb_ext_io_in,
-	    wb_ext_io_out => wb_ext_io_out,
+            )
+        port map(
+            rst => soc_rst,
+            system_clk => system_clk,
+            wb_dram_in => wb_dram_in,
+            wb_dram_out => wb_dram_out,
+            wb_ext_io_in => wb_ext_io_in,
+            wb_ext_io_out => wb_ext_io_out,
            wb_ext_is_dram_csr => wb_ext_is_dram_csr,
            wb_ext_is_dram_init => wb_ext_is_dram_init,
            spi_flash_sck     => spi_sck,
@ -83,8 +83,8 @@ begin
            spi_flash_sdat_o  => spi_sdat_o,
            spi_flash_sdat_oe => spi_sdat_oe,
            spi_flash_sdat_i  => spi_sdat_i,
-	    alt_reset => core_alt_reset
-	    );
+            alt_reset => core_alt_reset
+            );

        flash: entity work.s25fl128s
        generic map (
@ -121,6 +121,7 @@ begin
            DRAM_ABITS => 24,
            DRAM_ALINES => 1,
            DRAM_DLINES => 16,
+            DRAM_CKLINES => 1,
            DRAM_PORT_WIDTH => 128,
            PAYLOAD_FILE => DRAM_INIT_FILE,
            PAYLOAD_SIZE => ROM_SIZE
@ -142,18 +143,18 @@ begin

    clk_process: process
    begin
-	clk <= '0';
-	wait for clk_period/2;
-	clk <= '1';
-	wait for clk_period/2;
+        clk <= '0';
+        wait for clk_period/2;
+        clk <= '1';
+        wait for clk_period/2;
    end process;

    rst_process: process
    begin
-	rst <= '1';
-	wait for 10*clk_period;
-	rst <= '0';
-	wait;
+        rst <= '1';
+        wait for 10*clk_period;
+        rst <= '0';
+        wait;
    end process;

    jtag: entity work.sim_jtag;
--- a/core_flash_tb.vhdl
+++ b/core_flash_tb.vhdl
@ -10,10 +10,10 @@ entity core_flash_tb is
 end core_flash_tb;

 architecture behave of core_flash_tb is
-	signal clk, rst: std_logic;
+        signal clk, rst: std_logic;

-	-- testbench signals
-	constant clk_period : time := 10 ns;
+        -- testbench signals
+        constant clk_period : time := 10 ns;

        -- SPI
        signal spi_sck     : std_ulogic;
@ -28,24 +28,24 @@ architecture behave of core_flash_tb is
 begin

    soc0: entity work.soc
-	generic map(
-	    SIM => true,
-	    MEMORY_SIZE => (384*1024),
-	    RAM_INIT_FILE => "main_ram.bin",
-	    CLK_FREQ => 100000000,
+        generic map(
+            SIM => true,
+            MEMORY_SIZE => (384*1024),
+            RAM_INIT_FILE => "main_ram.bin",
+            CLK_FREQ => 100000000,
            HAS_SPI_FLASH    => true,
            SPI_FLASH_DLINES => 4,
            SPI_FLASH_OFFSET => 0
-	    )
-	port map(
-	    rst => rst,
-	    system_clk => clk,
+            )
+        port map(
+            rst => rst,
+            system_clk => clk,
            spi_flash_sck     => spi_sck,
            spi_flash_cs_n    => spi_cs_n,
            spi_flash_sdat_o  => spi_sdat_o,
            spi_flash_sdat_oe => spi_sdat_oe,
            spi_flash_sdat_i  => spi_sdat_i
-	    );
+            );

    flash: entity work.s25fl128s
        generic map (
@ -78,18 +78,18 @@ begin
    
    clk_process: process
    begin
-	clk <= '0';
-	wait for clk_period/2;
-	clk <= '1';
-	wait for clk_period/2;
+        clk <= '0';
+        wait for clk_period/2;
+        clk <= '1';
+        wait for clk_period/2;
    end process;

    rst_process: process
    begin
-	rst <= '1';
-	wait for 10*clk_period;
-	rst <= '0';
-	wait;
+        rst <= '1';
+        wait for 10*clk_period;
+        rst <= '0';
+        wait;
    end process;

    jtag: entity work.sim_jtag;
--- a/core_tb.vhdl
+++ b/core_tb.vhdl
@ -10,38 +10,38 @@ entity core_tb is
 end core_tb;

 architecture behave of core_tb is
-	signal clk, rst: std_logic;
+        signal clk, rst: std_logic;

-	-- testbench signals
-	constant clk_period : time := 10 ns;
+        -- testbench signals
+        constant clk_period : time := 10 ns;
 begin

    soc0: entity work.soc
-	generic map(
-	    SIM => true,
-	    MEMORY_SIZE => (384*1024),
-	    RAM_INIT_FILE => "main_ram.bin",
-	    CLK_FREQ => 100000000
-	    )
-	port map(
-	    rst => rst,
-	    system_clk => clk
-	    );
+        generic map(
+            SIM => true,
+            MEMORY_SIZE => (384*1024),
+            RAM_INIT_FILE => "main_ram.bin",
+            CLK_FREQ => 100000000
+            )
+        port map(
+            rst => rst,
+            system_clk => clk
+            );

    clk_process: process
    begin
-	clk <= '0';
-	wait for clk_period/2;
-	clk <= '1';
-	wait for clk_period/2;
+        clk <= '0';
+        wait for clk_period/2;
+        clk <= '1';
+        wait for clk_period/2;
    end process;

    rst_process: process
    begin
-	rst <= '1';
-	wait for 10*clk_period;
-	rst <= '0';
-	wait;
+        rst <= '1';
+        wait for 10*clk_period;
+        rst <= '0';
+        wait;
    end process;

    jtag: entity work.sim_jtag;
--- a/countbits.vhdl
+++ b/countbits.vhdl
@ -0,0 +1,136 @@
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+library work;
+use work.helpers.all;
+
+entity bit_counter is  -- count-zeroes / population-count unit; result lags the inputs by one clk edge
+    port (
+        clk         : in std_logic;  -- rising edge captures the intermediate stage registers
+        rs          : in std_ulogic_vector(63 downto 0);  -- 64-bit operand
+        count_right : in std_ulogic;  -- '1': operand used as-is; '0': operand is bit-reversed first
+        do_popcnt   : in std_ulogic;  -- sampled on clk; '1' selects the popcnt result, '0' the zero count
+        is_32bit    : in std_ulogic;  -- '1': zero count only looks at rs(31 downto 0)
+        datalen     : in std_ulogic_vector(3 downto 0);  -- sampled on clk; popcnt mode: "00xx" per byte, "01xx" per word, "1xxx" whole dword
+        result      : out std_ulogic_vector(63 downto 0)  -- computed from the registered stage only
+        );
+end entity bit_counter;
+
+architecture behaviour of bit_counter is
+    -- signals for count-leading/trailing-zeroes
+    signal inp : std_ulogic_vector(63 downto 0);  -- operand after optional bit reversal / 32-bit masking
+    signal inp_r : std_ulogic_vector(63 downto 0);  -- inp registered on clk
+    signal sum : std_ulogic_vector(64 downto 0);  -- (not inp) + 1 with carry-out in bit 64
+    signal sum_r : std_ulogic_vector(64 downto 0);  -- sum registered on clk
+    signal onehot : std_ulogic_vector(63 downto 0);  -- (-inp_r) and inp_r: only the lowest set bit of inp_r
+    signal edge : std_ulogic_vector(63 downto 0);  -- (-inp_r) or inp_r: ones from the lowest set bit upward
+    signal bitnum : std_ulogic_vector(5 downto 0);  -- low 6 bits of the zero count
+    signal cntz : std_ulogic_vector(63 downto 0);  -- zero-count result, zero-extended to 64 bits
+
+    -- signals for popcnt
+    signal dlen_r   : std_ulogic_vector(3 downto 0);  -- datalen registered on clk
+    signal pcnt_r   : std_ulogic;  -- do_popcnt registered on clk
+    subtype twobit is unsigned(1 downto 0);
+    type twobit32 is array(0 to 31) of twobit;
+    signal pc2      : twobit32;  -- ones count of each 2-bit group of rs
+    subtype threebit is unsigned(2 downto 0);
+    type threebit16 is array(0 to 15) of threebit;
+    signal pc4      : threebit16;  -- ones count of each 4-bit group of rs
+    subtype fourbit is unsigned(3 downto 0);
+    type fourbit8 is array(0 to 7) of fourbit;
+    signal pc8      : fourbit8;  -- ones count of each byte of rs
+    signal pc8_r    : fourbit8;  -- pc8 registered on clk
+    subtype sixbit is unsigned(5 downto 0);
+    type sixbit2 is array(0 to 1) of sixbit;
+    signal pc32     : sixbit2;  -- ones count of each 32-bit word, from pc8_r
+    signal popcnt   : std_ulogic_vector(63 downto 0);  -- popcnt result in the format chosen by dlen_r
+
+begin
+    countzero_r: process(clk)  -- pipeline register between the two halves of the zero count
+    begin
+        if rising_edge(clk) then
+            inp_r <= inp;
+            sum_r <= sum;
+        end if;
+    end process;
+
+    countzero: process(all)  -- combinational logic on both sides of the countzero_r register
+        variable bitnum_e, bitnum_o : std_ulogic_vector(5 downto 0);
+    begin
+        if is_32bit = '0' then
+            if count_right = '0' then
+                inp <= bit_reverse(rs);  -- bit_reverse comes from work.helpers; presumably mirrors the bit order -- TODO confirm
+            else
+                inp <= rs;
+            end if;
+        else
+            inp(63 downto 32) <= x"FFFFFFFF";  -- all-ones upper half guarantees a set bit at position 32
+            if count_right = '0' then
+                inp(31 downto 0) <= bit_reverse(rs(31 downto 0));
+            else
+                inp(31 downto 0) <= rs(31 downto 0);
+            end if;
+        end if;
+
+        sum <= std_ulogic_vector(unsigned('0' & not inp) + 1);  -- bit 64 is set only when inp is all zeroes
+
+        -- The following occurs after a clock edge
+        edge <= sum_r(63 downto 0) or inp_r;
+        bitnum_e := edgelocation(edge, 6);  -- helper from work.helpers; presumably the index where edge turns to ones -- TODO confirm
+        onehot <= sum_r(63 downto 0) and inp_r;
+        bitnum_o := bit_number(onehot);  -- helper from work.helpers; presumably the index of the single set bit -- TODO confirm
+        bitnum(5 downto 2) <= bitnum_e(5 downto 2);  -- upper four index bits taken from the edge form
+        bitnum(1 downto 0) <= bitnum_o(1 downto 0);  -- lower two index bits taken from the one-hot form
+
+        cntz <= 57x"0" & sum_r(64) & bitnum;  -- bit 6 is the registered carry-out, bits 5..0 are bitnum
+    end process;
+
+    popcnt_r: process(clk)  -- pipeline register for the popcnt path and its mode inputs
+    begin
+        if rising_edge(clk) then
+            for i in 0 to 7 loop
+                pc8_r(i) <= pc8(i);
+            end loop;
+            dlen_r <= datalen;
+            pcnt_r <= do_popcnt;
+        end if;
+    end process;
+
+    popcnt_a: process(all)  -- adder tree: pairs -> nibbles -> bytes, then (after the register) words / dword
+    begin
+        for i in 0 to 31 loop
+            pc2(i) <= unsigned("0" & rs(i * 2 downto i * 2)) + unsigned("0" & rs(i * 2 + 1 downto i * 2 + 1));
+        end loop;
+        for i in 0 to 15 loop
+            pc4(i) <= ('0' & pc2(i * 2)) + ('0' & pc2(i * 2 + 1));  -- widen by one bit so the sum cannot overflow
+        end loop;
+        for i in 0 to 7 loop
+            pc8(i) <= ('0' & pc4(i * 2)) + ('0' & pc4(i * 2 + 1));
+        end loop;
+
+        -- after a clock edge
+        for i in 0 to 1 loop
+            pc32(i) <= ("00" & pc8_r(i * 4)) + ("00" & pc8_r(i * 4 + 1)) +
+                       ("00" & pc8_r(i * 4 + 2)) + ("00" & pc8_r(i * 4 + 3));
+        end loop;
+        
+        popcnt <= (others => '0');  -- default; the branches below overwrite only the count fields
+        if dlen_r(3 downto 2) = "00" then
+            -- popcntb
+            for i in 0 to 7 loop
+                popcnt(i * 8 + 3 downto i * 8) <= std_ulogic_vector(pc8_r(i));  -- each byte's count in the low 4 bits of that byte
+            end loop;
+        elsif dlen_r(3) = '0' then
+            -- popcntw
+            for i in 0 to 1 loop
+                popcnt(i * 32 + 5 downto i * 32) <= std_ulogic_vector(pc32(i));  -- each word's count in the low 6 bits of that word
+            end loop;
+        else
+            popcnt(6 downto 0) <= std_ulogic_vector(('0' & pc32(0)) + ('0' & pc32(1)));  -- count over all 64 bits
+        end if;
+    end process;
+
+    result <= cntz when pcnt_r = '0' else popcnt;  -- select using the registered do_popcnt
+
+end behaviour;
--- a/countbits_tb.vhdl
+++ b/countbits_tb.vhdl
@ -0,0 +1,118 @@
+library vunit_lib;
+context vunit_lib.vunit_context;
+
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+library work;
+use work.common.all;
+
+library osvvm;
+use osvvm.RandomPkg.all;
+
+entity countbits_tb is  -- testbench for the zero-count path of work.bit_counter
+    generic (runner_cfg : string := runner_cfg_default);  -- passed to test_runner_setup in stim_process
+end countbits_tb;
+
+architecture behave of countbits_tb is
+    constant clk_period: time := 10 ns;  -- full clock period used by clk_process and all waits
+    signal rs: std_ulogic_vector(63 downto 0);  -- operand driven into the DUT
+    signal is_32bit, count_right: std_ulogic := '0';  -- DUT mode controls
+    signal res: std_ulogic_vector(63 downto 0);  -- DUT result under test
+    signal clk: std_ulogic;
+
+begin
+    bitcounter_0: entity work.bit_counter
+        port map (
+            clk => clk,
+            rs => rs,
+            result => res,
+            count_right => count_right,
+            is_32bit => is_32bit,
+            do_popcnt => '0',  -- tied off: only the zero-count result is exercised here
+            datalen => "0000"  -- tied off together with do_popcnt
+        );
+
+    clk_process: process  -- free-running clock: half a period low, half a period high
+    begin
+        clk <= '0';
+        wait for clk_period/2;
+        clk <= '1';
+        wait for clk_period/2;
+    end process;
+
+    stim_process: process  -- drives the stimulus and checks res for each named test case
+        variable r: std_ulogic_vector(63 downto 0);  -- pattern shifted one bit per inner iteration
+        variable rnd : RandomPType;
+    begin
+        rnd.InitSeed(stim_process'path_name);  -- seed derived from this process's path name
+
+        test_runner_setup(runner, runner_cfg);
+
+        while test_suite loop
+            if run("Test with input = 0") then
+                rs <= (others => '0');
+                is_32bit <= '0';
+                count_right <= '0';
+                wait for clk_period;  -- one full clock period between driving inputs and checking res
+                check_equal(res, 16#40#, result("for cntlzd"));  -- all-zero dword: expect 64
+                count_right <= '1';
+                wait for clk_period;
+                check_equal(res, 16#40#, result("for cnttzd"));
+                is_32bit <= '1';
+                count_right <= '0';
+                wait for clk_period;
+                check_equal(res, 16#20#, result("for cntlzw"));  -- all-zero word: expect 32
+                count_right <= '1';
+                wait for clk_period;
+                check_equal(res, 16#20#, result("for cnttzw"));
+
+            elsif run("Test cntlzd/w") then
+                count_right <= '0';
+                for j in 0 to 100 loop
+                    r := rnd.RandSlv(64);
+                    r(63) := '1';  -- force the top bit so the pattern starts with zero leading zeroes
+                    for i in 0 to 63 loop
+                        rs <= r;
+                        is_32bit <= '0';
+                        wait for clk_period;
+                        check_equal(res, i, result("for cntlzd " & to_hstring(rs)));  -- r has exactly i leading zeroes
+                        rs <= r(31 downto 0) & r(63 downto 32);  -- swap halves so the upper word of r lands in rs(31 downto 0)
+                        is_32bit <= '1';
+                        wait for clk_period;
+                        if i < 32 then
+                            check_equal(res, i, result("for cntlzw " & to_hstring(rs)));
+                        else
+                            check_equal(res, 32, result("for cntlzw " & to_hstring(rs)));  -- low word is all zero once i reaches 32
+                        end if;
+                        r := '0' & r(63 downto 1);  -- shift right: one more leading zero next time
+                    end loop;
+                end loop;
+
+            elsif run("Test cnttzd/w") then
+                count_right <= '1';
+                for j in 0 to 100 loop
+                    r := rnd.RandSlv(64);
+                    r(0) := '1';  -- force bit 0 so the pattern starts with zero trailing zeroes
+                    for i in 0 to 63 loop
+                        rs <= r;
+                        is_32bit <= '0';
+                        wait for clk_period;
+                        check_equal(res, i, result("for cnttzd " & to_hstring(rs)));  -- r has exactly i trailing zeroes
+                        is_32bit <= '1';  -- rs is unchanged; only the mode switches to the 32-bit form
+                        wait for clk_period;
+                        if i < 32 then
+                            check_equal(res, i, result("for cnttzw " & to_hstring(rs)));
+                        else
+                            check_equal(res, 32, result("for cnttzw " & to_hstring(rs)));  -- low word is all zero once i reaches 32
+                        end if;
+                        r := r(62 downto 0) & '0';  -- shift left: one more trailing zero next time
+                    end loop;
+                end loop;
+            end if;
+        end loop;
+
+        test_runner_cleanup(runner);
+    end process;
+end behave;
--- a/countzero.vhdl
+++ b/countzero.vhdl
@ -1,60 +0,0 @@
-library ieee;
-use ieee.std_logic_1164.all;
-use ieee.numeric_std.all;
-
-library work;
-use work.helpers.all;
-
-entity zero_counter is
-    port (
-        clk         : in std_logic;
-	rs          : in std_ulogic_vector(63 downto 0);
-	count_right : in std_ulogic;
-	is_32bit    : in std_ulogic;
-	result      : out std_ulogic_vector(63 downto 0)
-	);
-end entity zero_counter;
-
-architecture behaviour of zero_counter is
-    signal inp : std_ulogic_vector(63 downto 0);
-    signal sum : std_ulogic_vector(64 downto 0);
-    signal msb_r : std_ulogic;
-    signal onehot : std_ulogic_vector(63 downto 0);
-    signal onehot_r : std_ulogic_vector(63 downto 0);
-    signal bitnum : std_ulogic_vector(5 downto 0);
-
-begin
-    countzero_r: process(clk)
-    begin
-        if rising_edge(clk) then
-            msb_r <= sum(64);
-            onehot_r <= onehot;
-        end if;
-    end process;
-
-    countzero: process(all)
-    begin
-        if is_32bit = '0' then
-            if count_right = '0' then
-                inp <= bit_reverse(rs);
-            else
-                inp <= rs;
-            end if;
-        else
-            inp(63 downto 32) <= x"FFFFFFFF";
-            if count_right = '0' then
-                inp(31 downto 0) <= bit_reverse(rs(31 downto 0));
-            else
-                inp(31 downto 0) <= rs(31 downto 0);
-            end if;
-        end if;
-
-        sum <= std_ulogic_vector(unsigned('0' & not inp) + 1);
-        onehot <= sum(63 downto 0) and inp;
-
-        -- The following occurs after a clock edge
-        bitnum <= bit_number(onehot_r);
-
-        result <= x"00000000000000" & "0" & msb_r & bitnum;
-    end process;
-end behaviour;
--- a/countzero_tb.vhdl
+++ b/countzero_tb.vhdl
@ -1,114 +0,0 @@
-library ieee;
-use ieee.std_logic_1164.all;
-use ieee.numeric_std.all;
-
-library work;
-use work.common.all;
-use work.glibc_random.all;
-
-entity countzero_tb is
-end countzero_tb;
-
-architecture behave of countzero_tb is
-    constant clk_period: time := 10 ns;
-    signal rs: std_ulogic_vector(63 downto 0);
-    signal is_32bit, count_right: std_ulogic := '0';
-    signal result: std_ulogic_vector(63 downto 0);
-    signal randno: std_ulogic_vector(63 downto 0);
-    signal clk: std_ulogic;
-
-begin
-    zerocounter_0: entity work.zero_counter
-	port map (
-            clk => clk,
-	    rs => rs,
-	    result => result,
-	    count_right => count_right,
-	    is_32bit => is_32bit
-	);
-
-    clk_process: process
-    begin
-        clk <= '0';
-        wait for clk_period/2;
-        clk <= '1';
-        wait for clk_period/2;
-    end process;
-
-    stim_process: process
-        variable r: std_ulogic_vector(63 downto 0);
-    begin
-        -- test with input = 0
-        report "test zero input";
-        rs <= (others => '0');
-        is_32bit <= '0';
-        count_right <= '0';
-        wait for clk_period;
-        assert result = x"0000000000000040"
-            report "bad cntlzd 0 = " & to_hstring(result);
-        count_right <= '1';
-        wait for clk_period;
-        assert result = x"0000000000000040"
-            report "bad cnttzd 0 = " & to_hstring(result);
-        is_32bit <= '1';
-        count_right <= '0';
-        wait for clk_period;
-        assert result = x"0000000000000020"
-            report "bad cntlzw 0 = " & to_hstring(result);
-        count_right <= '1';
-        wait for clk_period;
-        assert result = x"0000000000000020"
-            report "bad cnttzw 0 = " & to_hstring(result);
-
-        report "test cntlzd/w";
-        count_right <= '0';
-        for j in 0 to 100 loop
-            r := pseudorand(64);
-            r(63) := '1';
-            for i in 0 to 63 loop
-                rs <= r;
-                is_32bit <= '0';
-                wait for clk_period;
-                assert to_integer(unsigned(result)) = i
-                    report "bad cntlzd " & to_hstring(rs) & " -> " & to_hstring(result);
-                rs <= r(31 downto 0) & r(63 downto 32);
-                is_32bit <= '1';
-                wait for clk_period;
-                if i < 32 then
-                    assert to_integer(unsigned(result)) = i
-                        report "bad cntlzw " & to_hstring(rs) & " -> " & to_hstring(result);
-                else
-                    assert to_integer(unsigned(result)) = 32
-                        report "bad cntlzw " & to_hstring(rs) & " -> " & to_hstring(result);
-                end if;
-                r := '0' & r(63 downto 1);
-            end loop;
-        end loop;
-
-        report "test cnttzd/w";
-        count_right <= '1';
-        for j in 0 to 100 loop
-            r := pseudorand(64);
-            r(0) := '1';
-            for i in 0 to 63 loop
-                rs <= r;
-                is_32bit <= '0';
-                wait for clk_period;
-                assert to_integer(unsigned(result)) = i
-                    report "bad cnttzd " & to_hstring(rs) & " -> " & to_hstring(result);
-                is_32bit <= '1';
-                wait for clk_period;
-                if i < 32 then
-                    assert to_integer(unsigned(result)) = i
-                        report "bad cnttzw " & to_hstring(rs) & " -> " & to_hstring(result);
-                else
-                    assert to_integer(unsigned(result)) = 32
-                        report "bad cnttzw " & to_hstring(rs) & " -> " & to_hstring(result);
-                end if;
-                r := r(62 downto 0) & '0';
-            end loop;
-        end loop;
-
-	std.env.finish;
-    end process;
-end behave;
--- a/cr_file.vhdl
+++ b/cr_file.vhdl
@ -48,11 +48,11 @@ begin

        crs_updated <= cr_tmp;

-	if w_in.write_xerc_enable = '1' then
-	    xerc_updated <= w_in.write_xerc_data;
-	else
-	    xerc_updated <= xerc;
-	end if;
+        if w_in.write_xerc_enable = '1' then
+            xerc_updated <= w_in.write_xerc_data;
+        else
+            xerc_updated <= xerc;
+        end if;

    end process;

@ -62,12 +62,12 @@ begin
        if rising_edge(clk) then
            if w_in.write_cr_enable = '1' then
                report "Writing " & to_hstring(w_in.write_cr_data) & " to CR mask " & to_hstring(w_in.write_cr_mask);
-		crs <= crs_updated;
+                crs <= crs_updated;
            end if;
-	    if w_in.write_xerc_enable = '1' then
+            if w_in.write_xerc_enable = '1' then
                report "Writing XERC";
-		xerc <= xerc_updated;
-	    end if;
+                xerc <= xerc_updated;
+            end if;
        end if;
    end process;

@ -87,7 +87,7 @@ begin
        begin
            if sim_dump = '1' then
                report "CR 00000000" & to_hstring(crs);
-		assert false report "end of test" severity failure;
+                assert false report "end of test" severity failure;
            end if;
        end process;
    end generate;
--- a/cr_hazard.vhdl
+++ b/cr_hazard.vhdl
@ -1,86 +0,0 @@
-library ieee;
-use ieee.std_logic_1164.all;
-use ieee.numeric_std.all;
-
-entity cr_hazard is
-    generic (
-        PIPELINE_DEPTH : natural := 1
-        );
-    port(
-        clk         : in std_ulogic;
-        busy_in     : in std_ulogic;
-        deferred    : in std_ulogic;
-        complete_in : in std_ulogic;
-        flush_in    : in std_ulogic;
-        issuing     : in std_ulogic;
-
-        cr_read_in  : in std_ulogic;
-        cr_write_in : in std_ulogic;
-        bypassable  : in std_ulogic;
-
-        stall_out   : out std_ulogic;
-        use_bypass  : out std_ulogic
-        );
-end entity cr_hazard;
-architecture behaviour of cr_hazard is
-    type pipeline_entry_type is record
-        valid  : std_ulogic;
-        bypass : std_ulogic;
-    end record;
-    constant pipeline_entry_init : pipeline_entry_type := (valid => '0', bypass => '0');
-
-    type pipeline_t is array(0 to PIPELINE_DEPTH) of pipeline_entry_type;
-    constant pipeline_t_init : pipeline_t := (others => pipeline_entry_init);
-
-    signal r, rin : pipeline_t := pipeline_t_init;
-begin
-    cr_hazard0: process(clk)
-    begin
-        if rising_edge(clk) then
-            r <= rin;
-        end if;
-    end process;
-
-    cr_hazard1: process(all)
-        variable v     : pipeline_t;
-    begin
-        v := r;
-
-        -- XXX assumes PIPELINE_DEPTH = 1
-        if complete_in = '1' then
-            v(1).valid := '0';
-        end if;
-
-        use_bypass <= '0';
-        stall_out <= '0';
-        if cr_read_in = '1' then
-            loop_0: for i in 0 to PIPELINE_DEPTH loop
-                if v(i).valid = '1' then
-                    if r(i).bypass = '1' then
-                        use_bypass <= '1';
-                    else
-                        stall_out <= '1';
-                    end if;
-                end if;
-            end loop;
-        end if;
-
-        -- XXX assumes PIPELINE_DEPTH = 1
-        if busy_in = '0' then
-            v(1) := r(0);
-            v(0).valid := '0';
-        end if;
-        if deferred = '0' and issuing = '1' then
-            v(0).valid := cr_write_in;
-            v(0).bypass := bypassable;
-        end if;
-        if flush_in = '1' then
-            v(0).valid := '0';
-            v(1).valid := '0';
-        end if;
-
-        -- update registers
-        rin <= v;
-
-    end process;
-end;
--- a/dcache.vhdl
+++ b/dcache.vhdl
@ -1,12 +1,6 @@
 --
 -- Set associative dcache write-through
 --
-- TODO (in no specific order):
--
-- * See list in icache.vhdl
-- * Complete load misses on the cycle when WB data comes instead of
--   at the end of line (this requires dealing with requests coming in
--   while not idle...)
 --
 library ieee;
 use ieee.std_logic_1164.all;
@ -45,11 +39,15 @@ entity dcache is
        m_in         : in MmuToDcacheType;
        m_out        : out DcacheToMmuType;

+        snoop_in     : in wishbone_master_out := wishbone_master_out_init;
+
 	stall_out    : out std_ulogic;

        wishbone_out : out wishbone_master_out;
        wishbone_in  : in wishbone_slave_out;

+        events       : out DcacheEventType;
+
        log_out      : out std_ulogic_vector(19 downto 0)
        );
 end entity dcache;
@ -57,7 +55,7 @@ end entity dcache;
 architecture rtl of dcache is
    -- BRAM organisation: We never access more than wishbone_data_bits at
    -- a time so to save resources we make the array only that wide, and
-    -- use consecutive indices for to make a cache "line"
+    -- use consecutive indices to make a cache "line"
    --
    -- ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
    constant ROW_SIZE      : natural := wishbone_data_bits / 8;
@ -69,8 +67,6 @@ architecture rtl of dcache is

    -- Bit fields counts in the address

-    -- REAL_ADDR_BITS is the number of real address bits that we store
-    constant REAL_ADDR_BITS : positive := 56;
    -- ROW_BITS is the number of bits to select a row 
    constant ROW_BITS      : natural := log2(BRAM_ROWS);
    -- ROW_LINEBITS is the number of bits to select a row within a line
@ -124,7 +120,7 @@ architecture rtl of dcache is
    type cache_valids_t is array(index_t) of cache_way_valids_t;
    type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;

-    -- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
+    -- Storage. Hopefully implemented in LUTs
    signal cache_tags    : cache_tags_array_t;
    signal cache_tag_set : cache_tags_set_t;
    signal cache_valids  : cache_valids_t;
@ -206,21 +202,82 @@ architecture rtl of dcache is
    -- which means that the BRAM output is delayed by an extra cycle.
    --
    -- Thus, the dcache has a 2-stage internal pipeline for cache hits
-    -- with no stalls.
+    -- with no stalls.  Stores also complete in 2 cycles in most
+    -- circumstances.
+    --
+    -- A request proceeds through the pipeline as follows.
+    --
+    -- Cycle 0: Request is received from loadstore or mmu if either
+    -- d_in.valid or m_in.valid is 1 (not both).  In this cycle portions
+    -- of the address are presented to the TLB tag RAM and data RAM
+    -- and the cache tag RAM and data RAM.
+    --
+    -- Clock edge between cycle 0 and cycle 1:
+    -- Request is stored in r0 (assuming r0_full was 0).  TLB tag and
+    -- data RAMs are read, and the cache tag RAM is read.  (Cache data
+    -- comes out a cycle later due to its output register, giving the
+    -- whole of cycle 1 to read the cache data RAM.)
    --
-    -- All other operations are handled via stalling in the first stage.
+    -- Cycle 1: TLB and cache tag matching is done, the real address
+    -- (RA) for the access is calculated, and the type of operation is
+    -- determined (the OP_* values above).  This gives the TLB way for
+    -- a TLB hit, and the cache way for a hit or the way to replace
+    -- for a load miss.
    --
-    -- The second stage can thus complete a hit at the same time as the
-    -- first stage emits a stall for a complex op.
+    -- Clock edge between cycle 1 and cycle 2:
+    -- Request is stored in r1 (assuming r1.full was 0)
+    -- The state machine transitions out of IDLE state for a load miss,
+    -- a store, a dcbz, or a non-cacheable load.  r1.full is set to 1
+    -- for a load miss, dcbz or non-cacheable load but not a store.
    --
+    -- Cycle 2: Completion signals are asserted for a load hit,
+    -- a store (excluding dcbz), a TLB operation, a conditional
+    -- store which failed due to no matching reservation, or an error
+    -- (cache hit on non-cacheable operation, TLB miss, or protection
+    -- fault).
+    --
+    -- For a load miss, store, or dcbz, the state machine initiates
+    -- a wishbone cycle, which takes at least 2 cycles.  For a store,
+    -- if another store comes in with the same cache tag (therefore
+    -- in the same 4k page), it can be added on to the existing cycle,
+    -- subject to some constraints.
+    -- While r1.full = 1, no new requests can go from r0 to r1, but
+    -- requests can come in to r0 and be satisfied if they are
+    -- cacheable load hits or stores with the same cache tag.
+    --
+    -- Writing to the cache data RAM is done at the clock edge
+    -- at the end of cycle 2 for a store hit (excluding dcbz).
+    -- Stores that miss are not written to the cache data RAM
+    -- but just stored through to memory.
+    -- Dcbz is done like a cache miss, but the wishbone cycle
+    -- is a write rather than a read, and zeroes are written to
+    -- the cache data RAM.  Thus dcbz will allocate the line in
+    -- the cache as well as zeroing memory.
+    --
+    -- Since stores are written to the cache data RAM at the end of
+    -- cycle 2, and loads can come in and hit on the data just stored,
+    -- there is a two-stage bypass from store data to load data to
+    -- make sure that loads always see previously-stored data even
+    -- if it has not yet made it to the cache data RAM.
+    --
+    -- Load misses read the requested dword of the cache line first in
+    -- the memory read request and then cycle around through the other
+    -- dwords.  The load is completed on the cycle after the requested
+    -- dword comes back from memory (using a forwarding path, rather
+    -- than going via the cache data RAM).  We maintain an array of
+    -- valid bits per dword for the line being refilled so that
+    -- subsequent load requests to the same line can be completed as
+    -- soon as the necessary data comes in from memory, without
+    -- waiting for the whole line to be read.

    -- Stage 0 register, basically contains just the latched request
    type reg_stage_0_t is record
        req   : Loadstore1ToDcacheType;
-        tlbie : std_ulogic;
-        doall : std_ulogic;
-        tlbld : std_ulogic;
+        tlbie : std_ulogic;     -- indicates a tlbie request (from MMU)
+        doall : std_ulogic;     -- with tlbie, indicates flush whole TLB
+        tlbld : std_ulogic;     -- indicates a TLB load request (from MMU)
        mmu_req : std_ulogic;   -- indicates source of request
+        d_valid : std_ulogic;   -- indicates req.data is valid now
    end record;

    signal r0 : reg_stage_0_t;
@ -230,7 +287,7 @@ architecture rtl of dcache is
        op        : op_t;
        valid     : std_ulogic;
        dcbz      : std_ulogic;
-        real_addr : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0);
+        real_addr : real_addr_t;
        data      : std_ulogic_vector(63 downto 0);
        byte_sel  : std_ulogic_vector(7 downto 0);
        hit_way   : way_t;
@ -258,15 +315,13 @@ architecture rtl of dcache is
        tlb_hit_way      : tlb_way_t;
        tlb_hit_index    : tlb_index_t;

-	-- 2-stage data buffer for data forwarded from writes to reads
-	forward_data1    : std_ulogic_vector(63 downto 0);
-	forward_data2    : std_ulogic_vector(63 downto 0);
-        forward_sel1     : std_ulogic_vector(7 downto 0);
-	forward_valid1   : std_ulogic;
-        forward_way1     : way_t;
-        forward_row1     : row_t;
-        use_forward1     : std_ulogic;
+	-- data buffer for data forwarded from writes to reads
+	forward_data     : std_ulogic_vector(63 downto 0);
+        forward_tag      : cache_tag_t;
        forward_sel      : std_ulogic_vector(7 downto 0);
+	forward_valid    : std_ulogic;
+        forward_row      : row_t;
+        data_out         : std_ulogic_vector(63 downto 0);

 	-- Cache miss state (reload state machine)
        state            : state_t;
@ -298,6 +353,8 @@ architecture rtl of dcache is

    signal r1 : reg_stage_1_t;

+    signal ev : DcacheEventType;
+
    -- Reservation information
    --
    type reservation_t is record
@ -326,12 +383,16 @@ architecture rtl of dcache is
    signal r0_valid   : std_ulogic;
    signal r0_stall   : std_ulogic;

-    signal use_forward1_next : std_ulogic;
-    signal use_forward2_next : std_ulogic;
+    signal fwd_same_tag : std_ulogic;
+    signal use_forward_st : std_ulogic;
+    signal use_forward_rl : std_ulogic;
+    signal use_forward2 : std_ulogic;

    -- Cache RAM interface
    type cache_ram_out_t is array(way_t) of cache_row_t;
    signal cache_out   : cache_ram_out_t;
+    signal ram_wr_data : cache_row_t;
+    signal ram_wr_select : std_ulogic_vector(ROW_SIZE - 1 downto 0);

    -- PLRU output interface
    type plru_out_t is array(index_t) of std_ulogic_vector(WAY_BITS-1 downto 0);
@ -349,17 +410,23 @@ architecture rtl of dcache is
    signal tlb_hit : std_ulogic;
    signal tlb_hit_way : tlb_way_t;
    signal pte : tlb_pte_t;
-    signal ra : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0);
+    signal ra : real_addr_t;
    signal valid_ra : std_ulogic;
    signal perm_attr : perm_attr_t;
    signal rc_ok : std_ulogic;
    signal perm_ok : std_ulogic;
    signal access_ok : std_ulogic;
+    signal tlb_miss : std_ulogic;

    -- TLB PLRU output interface
    type tlb_plru_out_t is array(tlb_index_t) of std_ulogic_vector(TLB_WAY_BITS-1 downto 0);
    signal tlb_plru_victim : tlb_plru_out_t;

+    signal snoop_tag_set : cache_tags_set_t;
+    signal snoop_valid   : std_ulogic;
+    signal snoop_wrtag   : cache_tag_t;
+    signal snoop_index   : index_t;
+
    --
    -- Helper functions to decode incoming requests
    --
@ -385,9 +452,9 @@ architecture rtl of dcache is
    end;

    -- Returns whether this is the last row of a line
-    function is_last_row_addr(addr: wishbone_addr_type; last: row_in_line_t) return boolean is
+    function is_last_row_wb_addr(addr: wishbone_addr_type; last: row_in_line_t) return boolean is
    begin
-	return unsigned(addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)) = last;
+	return unsigned(addr(LINE_OFF_BITS - ROW_OFF_BITS - 1 downto 0)) = last;
    end;

    -- Returns whether this is the last row of a line
@ -397,15 +464,15 @@ architecture rtl of dcache is
    end;

    -- Return the address of the next row in the current cache line
-    function next_row_addr(addr: wishbone_addr_type) return std_ulogic_vector is
+    function next_row_wb_addr(addr: wishbone_addr_type) return std_ulogic_vector is
 	variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
 	variable result  : wishbone_addr_type;
    begin
 	-- Is there no simpler way in VHDL to generate that 3 bits adder ?
-	row_idx := addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS);
+	row_idx := addr(ROW_LINEBITS - 1 downto 0);
 	row_idx := std_ulogic_vector(unsigned(row_idx) + 1);
 	result := addr;
-	result(LINE_OFF_BITS-1 downto ROW_OFF_BITS) := row_idx;
+	result(ROW_LINEBITS - 1 downto 0) := row_idx;
 	return result;
    end;

@ -473,7 +540,8 @@ begin
    assert LINE_SIZE mod ROW_SIZE = 0 report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE;
    assert ispow2(LINE_SIZE)    report "LINE_SIZE not power of 2" severity FAILURE;
    assert ispow2(NUM_LINES)    report "NUM_LINES not power of 2" severity FAILURE;
-    assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2" severity FAILURE;
+    assert ispow2(ROW_PER_LINE) and ROW_PER_LINE > 1
+        report "ROW_PER_LINE not power of 2 greater than 1" severity FAILURE;
    assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS)
 	report "geometry bits don't add up" severity FAILURE;
    assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
@ -508,18 +576,27 @@ begin
                r.doall := m_in.doall;
                r.tlbld := m_in.tlbld;
                r.mmu_req := '1';
+                r.d_valid := '1';
            else
                r.req := d_in;
+                r.req.data := (others => '0');
                r.tlbie := '0';
                r.doall := '0';
                r.tlbld := '0';
                r.mmu_req := '0';
+                r.d_valid := '0';
            end if;
            if rst = '1' then
                r0_full <= '0';
-            elsif r1.full = '0' or r0_full = '0' then
+            elsif (r1.full = '0' and d_in.hold = '0') or r0_full = '0' then
                r0 <= r;
                r0_full <= r.req.valid;
+            elsif r0.d_valid = '0' then
+                -- Sample data the cycle after a request comes in from loadstore1.
+                -- If this request is already moving into r1 then the data will get
+                -- put directly into req.data in the dcache_slow process below.
+                r0.req.data <= d_in.data;
+                r0.d_valid <= r0.req.valid;
            end if;
        end if;
    end process;
@ -528,10 +605,12 @@ begin
    m_out.stall <= '0';

    -- Hold off the request in r0 when r1 has an uncompleted request
-    r0_stall <= r0_full and r1.full;
-    r0_valid <= r0_full and not r1.full;
+    r0_stall <= r0_full and (r1.full or d_in.hold);
+    r0_valid <= r0_full and not r1.full and not d_in.hold;
    stall_out <= r0_stall;

+    events <= ev;
+
    -- TLB
    -- Operates in the second cycle on the request latched in r0.req.
    -- TLB updates write the entry at the end of the second cycle.
@ -616,6 +695,7 @@ begin
            pte <= (others => '0');
        end if;
        valid_ra <= tlb_hit or not r0.req.virt_mode;
+        tlb_miss <= r0_valid and r0.req.virt_mode and not tlb_hit;
        if r0.req.virt_mode = '1' then
            ra <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) &
                  r0.req.addr(TLB_LG_PGSZ - 1 downto ROW_OFF_BITS) &
@ -639,6 +719,7 @@ begin
        if rising_edge(clk) then
            tlbie := r0_valid and r0.tlbie;
            tlbwe := r0_valid and r0.tlbld;
+            ev.dtlb_miss_resolved <= tlbwe;
            if rst = '1' or (tlbie = '1' and r0.doall = '1') then
                -- clear all valid bits at once
                for i in tlb_index_t loop
@ -718,6 +799,23 @@ begin
        end if;
    end process;

+    -- Cache tag RAM second read port, for snooping
+    cache_tag_read_2 : process(clk)
+        variable addr : real_addr_t;
+    begin
+        if rising_edge(clk) then
+            addr := addr_to_real(wb_to_addr(snoop_in.adr));
+            snoop_tag_set <= cache_tags(get_index(addr));
+            snoop_wrtag <= get_tag(addr);
+            snoop_index <= get_index(addr);
+            -- Don't snoop our own cycles
+            snoop_valid <= '0';
+            if not (r1.wb.cyc = '1' and wishbone_in.stall = '0') then
+                snoop_valid <= snoop_in.cyc and snoop_in.stb and snoop_in.we;
+            end if;
+        end if;
+    end process;
+
    -- Cache request parsing and hit detection
    dcache_request : process(all)
        variable is_hit      : std_ulogic;
@ -729,11 +827,13 @@ begin
        variable s_hit       : std_ulogic;
        variable s_tag       : cache_tag_t;
        variable s_pte       : tlb_pte_t;
-        variable s_ra        : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0);
+        variable s_ra        : real_addr_t;
        variable hit_set     : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0);
        variable hit_way_set : hit_way_set_t;
        variable rel_matches : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0);
        variable rel_match   : std_ulogic;
+        variable fwd_matches : std_ulogic_vector(TLB_NUM_WAYS - 1 downto 0);
+        variable fwd_match   : std_ulogic;
    begin
 	-- Extract line, row and tag from request
        req_index <= get_index(r0.req.addr);
@ -749,8 +849,10 @@ begin
        hit_way := 0;
        is_hit := '0';
        rel_match := '0';
+        fwd_match := '0';
        if r0.req.virt_mode = '1' then
            rel_matches := (others => '0');
+            fwd_matches := (others => '0');
            for j in tlb_way_t loop
                hit_way_set(j) := 0;
                s_hit := '0';
@ -770,11 +872,15 @@ begin
                if s_tag = r1.reload_tag then
                    rel_matches(j) := '1';
                end if;
+                if s_tag = r1.forward_tag then
+                    fwd_matches(j) := '1';
+                end if;
            end loop;
            if tlb_hit = '1' then
                is_hit := hit_set(tlb_hit_way);
                hit_way := hit_way_set(tlb_hit_way);
                rel_match := rel_matches(tlb_hit_way);
+                fwd_match := fwd_matches(tlb_hit_way);
            end if;
        else
            s_tag := get_tag(r0.req.addr);
@ -788,39 +894,28 @@ begin
            if s_tag = r1.reload_tag then
                rel_match := '1';
            end if;
+            if s_tag = r1.forward_tag then
+                fwd_match := '1';
+            end if;
        end if;
        req_same_tag <= rel_match;
-
-        -- See if the request matches the line currently being reloaded
-        if r1.state = RELOAD_WAIT_ACK and req_index = r1.store_index and
-            rel_match = '1' then
-            -- For a store, consider this a hit even if the row isn't valid
-            -- since it will be by the time we perform the store.
-            -- For a load, check the appropriate row valid bit.
-            is_hit := not r0.req.load or r1.rows_valid(req_row mod ROW_PER_LINE);
-            hit_way := replace_way;
-        end if;
+        fwd_same_tag <= fwd_match;

        -- Whether to use forwarded data for a load or not
-        use_forward1_next <= '0';
-        if get_row(r1.req.real_addr) = req_row and r1.req.hit_way = hit_way then
-            -- Only need to consider r1.write_bram here, since if we are
-            -- writing refill data here, then we don't have a cache hit this
-            -- cycle on the line being refilled.  (There is the possibility
-            -- that the load following the load miss that started the refill
-            -- could be to the old contents of the victim line, since it is a
-            -- couple of cycles after the refill starts before we see the
-            -- updated cache tag.  In that case we don't use the bypass.)
-            use_forward1_next <= r1.write_bram;
+        use_forward_st <= '0';
+        use_forward_rl <= '0';
+        if r1.store_row = req_row and rel_match = '1' then
+            -- Use the forwarding path if this cycle is a write to this row
+            use_forward_st <= r1.write_bram;
+            if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' then
+                use_forward_rl <= '1';
+            end if;
        end if;
-        use_forward2_next <= '0';
-        if r1.forward_row1 = req_row and r1.forward_way1 = hit_way then
-            use_forward2_next <= r1.forward_valid1;
+        use_forward2 <= '0';
+        if r1.forward_row = req_row and fwd_match = '1' then
+            use_forward2 <= r1.forward_valid;
        end if;

-	-- The way that matched on a hit	       
-	req_hit_way <= hit_way;
-
        -- The way to replace on a miss
        if r1.write_tag = '1' then
            replace_way <= to_integer(unsigned(plru_victim(r1.store_index)));
@ -828,6 +923,23 @@ begin
            replace_way <= r1.store_way;
        end if;

+        -- See if the request matches the line currently being reloaded
+        if r1.state = RELOAD_WAIT_ACK and req_index = r1.store_index and
+            rel_match = '1' then
+            -- Ignore is_hit from above, because a load miss writes the new tag
+            -- but doesn't clear the valid bit on the line before refilling it.
+            -- For a store, consider this a hit even if the row isn't valid
+            -- since it will be by the time we perform the store.
+            -- For a load, check the appropriate row valid bit; but also,
+            -- if use_forward_rl is 1 then we can consider this a hit.
+            is_hit := not r0.req.load or r1.rows_valid(req_row mod ROW_PER_LINE) or
+                      use_forward_rl;
+            hit_way := replace_way;
+        end if;
+
+	-- The way that matched on a hit	       
+	req_hit_way <= hit_way;
+
        -- work out whether we have permission for this access
        -- NB we don't yet implement AMR, thus no KUAP
        rc_ok <= perm_attr.reference and (r0.req.load or perm_attr.changed);
@ -892,10 +1004,10 @@ begin
            -- XXX or if r0.req.nc = '1'
            if r0.req.load = '1' then
                -- load with reservation
-                set_rsrv <= '1';
+                set_rsrv <= r0.req.atomic_last;
            else
                -- store conditional
-                clear_rsrv <= '1';
+                clear_rsrv <= r0.req.atomic_last;
                if reservation.valid = '0' or
                    r0.req.addr(63 downto LINE_OFF_BITS) /= reservation.addr then
                    cancel_store <= '1';
@ -923,28 +1035,9 @@ begin
    -- Return data for loads & completion control logic
    --
    writeback_control: process(all)
-        variable data_out : std_ulogic_vector(63 downto 0);
-        variable data_fwd : std_ulogic_vector(63 downto 0);
-        variable j        : integer;
    begin
-        -- Use the bypass if are reading the row that was written 1 or 2 cycles
-        -- ago, including for the slow_valid = 1 case (i.e. completing a load
-        -- miss or a non-cacheable load).
-        if r1.use_forward1 = '1' then
-            data_fwd := r1.forward_data1;
-        else
-            data_fwd := r1.forward_data2;
-        end if;
-        data_out := cache_out(r1.hit_way);
-        for i in 0 to 7 loop
-            j := i * 8;
-            if r1.forward_sel(i) = '1' then
-                data_out(j + 7 downto j) := data_fwd(j + 7 downto j);
-            end if;
-        end loop;
-
 	d_out.valid <= r1.ls_valid;
-	d_out.data <= data_out;
+	d_out.data <= r1.data_out;
        d_out.store_done <= not r1.stcx_fail;
        d_out.error <= r1.ls_error;
        d_out.cache_paradox <= r1.cache_paradox;
@ -952,7 +1045,7 @@ begin
        -- Outputs to MMU
        m_out.done <= r1.mmu_done;
        m_out.err <= r1.mmu_error;
-        m_out.data <= data_out;
+        m_out.data <= r1.data_out;

 	-- We have a valid load or store hit or we just completed a slow
 	-- op such as a load miss, a NC load or a store
@ -976,7 +1069,7 @@ begin
            -- Request came from loadstore1...
            -- Load hit case is the standard path
            if r1.hit_load_valid = '1' then
-                report "completing load hit data=" & to_hstring(data_out);
+                report "completing load hit data=" & to_hstring(r1.data_out);
            end if;

            -- error cases complete without stalling
@ -986,7 +1079,7 @@ begin

            -- Slow ops (load miss, NC, stores)
            if r1.slow_valid = '1' then
-                report "completing store or load miss data=" & to_hstring(data_out);
+                report "completing store or load miss data=" & to_hstring(r1.data_out);
            end if;

        else
@ -1008,6 +1101,13 @@ begin

    end process;

+    -- RAM write data and select multiplexers
+    ram_wr_data <= r1.req.data when r1.write_bram = '1' else
+                   wishbone_in.dat when r1.dcbz = '0' else
+                   (others => '0');
+    ram_wr_select <= r1.req.byte_sel when r1.write_bram = '1' else
+                     (others => '1');
+
    --
    -- Generate a cache RAM for each way. This handles the normal
    -- reads, writes from reloads and the special store-hit update
@ -1021,7 +1121,6 @@ begin
    rams: for i in 0 to NUM_WAYS-1 generate
 	signal do_read  : std_ulogic;
 	signal rd_addr  : std_ulogic_vector(ROW_BITS-1 downto 0);
-	signal do_write : std_ulogic;
 	signal wr_addr  : std_ulogic_vector(ROW_BITS-1 downto 0);
 	signal wr_data  : std_ulogic_vector(wishbone_data_bits-1 downto 0);
 	signal wr_sel   : std_ulogic_vector(ROW_SIZE-1 downto 0);
@ -1032,7 +1131,7 @@ begin
 	    generic map (
 		ROW_BITS => ROW_BITS,
 		WIDTH => wishbone_data_bits,
-		ADD_BUF => true
+		ADD_BUF => false
 		)
 	    port map (
 		clk     => clk,
@ -1041,7 +1140,7 @@ begin
 		rd_data => dout,
 		wr_sel  => wr_sel_m,
 		wr_addr => wr_addr,
-		wr_data => wr_data
+		wr_data => ram_wr_data
 		);
 	process(all)
 	begin
@ -1057,37 +1156,13 @@ begin
 	    -- For timing, the mux on wr_data/sel/addr is not dependent on anything
 	    -- other than the current state.
 	    --
-            wr_sel_m <= (others => '0');
-
-	    do_write <= '0';
-            if r1.write_bram = '1' then
-                -- Write store data to BRAM.  This happens one cycle after the
-                -- store is in r0.
-                wr_data <= r1.req.data;
-                wr_sel  <= r1.req.byte_sel;
-                wr_addr <= std_ulogic_vector(to_unsigned(get_row(r1.req.real_addr), ROW_BITS));
-                if i = r1.req.hit_way then
-                    do_write <= '1';
-                end if;
-	    else
-		-- Otherwise, we might be doing a reload or a DCBZ
-                if r1.dcbz = '1' then
-                    wr_data <= (others => '0');
-                else
-                    wr_data <= wishbone_in.dat;
-                end if;
-                wr_addr <= std_ulogic_vector(to_unsigned(r1.store_row, ROW_BITS));
-                wr_sel <= (others => '1');
-
-                if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' and replace_way = i then
-                    do_write <= '1';
-                end if;
-	    end if;
+            wr_addr <= std_ulogic_vector(to_unsigned(r1.store_row, ROW_BITS));

-            -- Mask write selects with do_write since BRAM doesn't
-            -- have a global write-enable
-            if do_write = '1' then
-                wr_sel_m <= wr_sel;
+            wr_sel_m <= (others => '0');
+            if i = replace_way and
+                (r1.write_bram = '1' or
+                 (r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1')) then
+                wr_sel_m <= ram_wr_select;
            end if;

        end process;
@ -1098,20 +1173,60 @@ begin
    -- It also handles error cases (TLB miss, cache paradox)
    --
    dcache_fast_hit : process(clk)
+        variable j        : integer;
+        variable sel      : std_ulogic_vector(1 downto 0);
+        variable data_out : std_ulogic_vector(63 downto 0);
    begin
        if rising_edge(clk) then
            if req_op /= OP_NONE then
-		report "op:" & op_t'image(req_op) &
-		    " addr:" & to_hstring(r0.req.addr) &
-		    " nc:" & std_ulogic'image(r0.req.nc) &
-		    " idx:" & integer'image(req_index) &
-		    " tag:" & to_hstring(req_tag) &
-		    " way: " & integer'image(req_hit_way);
-	    end if;
+                report "op:" & op_t'image(req_op) &
+                    " addr:" & to_hstring(r0.req.addr) &
+                    " nc:" & std_ulogic'image(r0.req.nc) &
+                    " idx:" & integer'image(req_index) &
+                    " tag:" & to_hstring(req_tag) &
+                    " way: " & integer'image(req_hit_way);
+            end if;
            if r0_valid = '1' then
                r1.mmu_req <= r0.mmu_req;
            end if;

+            -- Bypass/forwarding multiplexer for load data.
+            -- Use the bypass if we are reading the row of BRAM that was written 0 or 1
+            -- cycles ago, including for the slow_valid = 1 cases (i.e. completing a
+            -- load miss or a non-cacheable load), which are handled via the r1.full case.
+            for i in 0 to 7 loop
+                if r1.full = '1' or use_forward_rl = '1' then
+                    sel := '0' & r1.dcbz;
+                elsif use_forward_st = '1' and r1.req.byte_sel(i) = '1' then
+                    sel := "01";
+                elsif use_forward2 = '1' and r1.forward_sel(i) = '1' then
+                    sel := "10";
+                else
+                    sel := "11";
+                end if;
+                j := i * 8;
+                case sel is
+                    when "00" =>
+                        data_out(j + 7 downto j) := wishbone_in.dat(j + 7 downto j);
+                    when "01" =>
+                        data_out(j + 7 downto j) := r1.req.data(j + 7 downto j);
+                    when "10" =>
+                        data_out(j + 7 downto j) := r1.forward_data(j + 7 downto j);
+                    when others =>
+                        data_out(j + 7 downto j) := cache_out(req_hit_way)(j + 7 downto j);
+                end case;
+            end loop;
+            r1.data_out <= data_out;
+
+            r1.forward_data <= ram_wr_data;
+            r1.forward_tag <= r1.reload_tag;
+            r1.forward_row <= r1.store_row;
+            r1.forward_sel <= ram_wr_select;
+            r1.forward_valid <= r1.write_bram;
+            if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' then
+                r1.forward_valid <= '1';
+            end if;
+
            -- Fast path for load/store hits. Set signals for the writeback controls.
            r1.hit_way <= req_hit_way;
            r1.hit_index <= req_index;
@ -1163,37 +1278,15 @@ begin
    -- operates at stage 1.
    --
    dcache_slow : process(clk)
-	variable stbs_done : boolean;
+        variable stbs_done : boolean;
        variable req       : mem_access_request_t;
        variable acks      : unsigned(2 downto 0);
    begin
        if rising_edge(clk) then
-            r1.use_forward1 <= use_forward1_next;
-            r1.forward_sel <= (others => '0');
-            if use_forward1_next = '1' then
-                r1.forward_sel <= r1.req.byte_sel;
-            elsif use_forward2_next = '1' then
-                r1.forward_sel <= r1.forward_sel1;
-            end if;
-
-            r1.forward_data2 <= r1.forward_data1;
-            if r1.write_bram = '1' then
-                r1.forward_data1 <= r1.req.data;
-                r1.forward_sel1 <= r1.req.byte_sel;
-                r1.forward_way1 <= r1.req.hit_way;
-                r1.forward_row1 <= get_row(r1.req.real_addr);
-                r1.forward_valid1 <= '1';
-            else
-                if r1.dcbz = '1' then
-                    r1.forward_data1 <= (others => '0');
-                else
-                    r1.forward_data1 <= wishbone_in.dat;
-                end if;
-                r1.forward_sel1 <= (others => '1');
-                r1.forward_way1 <= replace_way;
-                r1.forward_row1 <= r1.store_row;
-                r1.forward_valid1 <= '0';
-            end if;
+            ev.dcache_refill <= '0';
+            ev.load_miss <= '0';
+            ev.store_miss <= '0';
+            ev.dtlb_miss <= tlb_miss;

 	    -- On reset, clear all valid bits to force misses
            if rst = '1' then
@ -1228,6 +1321,13 @@ begin
                    end if;
                end if;

+                -- Do invalidations from snooped stores to memory
+                for i in way_t loop
+                    if snoop_valid = '1' and read_tag(i, snoop_tag_set) = snoop_wrtag then
+                        cache_valids(snoop_index)(i) <= '0';
+                    end if;
+                end loop;
+
                if r1.write_tag = '1' then
                    -- Store new tag in selected way
                    for i in 0 to NUM_WAYS-1 loop
@ -1251,13 +1351,15 @@ begin
                    req.dcbz := r0.req.dcbz;
                    req.real_addr := ra;
                    -- Force data to 0 for dcbz
-                    if r0.req.dcbz = '0' then
+                    if r0.req.dcbz = '1' then
+                        req.data := (others => '0');
+                    elsif r0.d_valid = '1' then
                        req.data := r0.req.data;
                    else
-                        req.data := (others => '0');
+                        req.data := d_in.data;
                    end if;
                    -- Select all bytes for dcbz and for cacheable loads
-                    if r0.req.dcbz = '1' or (r0.req.load = '1' and r0.req.nc = '0') then
+                    if r0.req.dcbz = '1' or (r0.req.load = '1' and r0.req.nc = '0' and perm_attr.nocache = '0') then
                        req.byte_sel := (others => '1');
                    else
                        req.byte_sel := r0.req.byte_sel;
@ -1277,7 +1379,7 @@ begin
 		-- Main state machine
 		case r1.state is
                when IDLE =>
-                    r1.wb.adr <= req.real_addr(r1.wb.adr'left downto 0);
+                    r1.wb.adr <= addr_to_wb(req.real_addr);
                    r1.wb.sel <= req.byte_sel;
                    r1.wb.dat <= req.data;
                    r1.dcbz <= req.dcbz;
@ -1317,6 +1419,7 @@ begin
 			-- Track that we had one request sent
 			r1.state <= RELOAD_WAIT_ACK;
                        r1.write_tag <= '1';
+                        ev.load_miss <= '1';

 		    when OP_LOAD_NC =>
                        r1.wb.cyc <= '1';
@ -1349,6 +1452,9 @@ begin
                        r1.wb.we <= '1';
                        r1.wb.cyc <= '1';
                        r1.wb.stb <= '1';
+                        if req.op = OP_STORE_MISS then
+                            ev.store_miss <= '1';
+                        end if;

 		    -- OP_NONE and OP_BAD do nothing
                    -- OP_BAD & OP_STCX_FAIL were handled above already
@ -1358,35 +1464,28 @@ begin
 		    end case;

                when RELOAD_WAIT_ACK =>
-                    -- Requests are all sent if stb is 0
-		    stbs_done := r1.wb.stb = '0';
-
 		    -- If we are still sending requests, was one accepted ?
-		    if wishbone_in.stall = '0' and not stbs_done then
-			-- That was the last word ? We are done sending. Clear
-			-- stb and set stbs_done so we can handle an eventual last
-			-- ack on the same cycle.
-			--
-			if is_last_row_addr(r1.wb.adr, r1.end_row_ix) then
+                    if wishbone_in.stall = '0' and r1.wb.stb = '1' then
+			-- That was the last word ? We are done sending. Clear stb.
+			if is_last_row_wb_addr(r1.wb.adr, r1.end_row_ix) then
 			    r1.wb.stb <= '0';
-			    stbs_done := true;
 			end if;

 			-- Calculate the next row address
-			r1.wb.adr <= next_row_addr(r1.wb.adr);
+			r1.wb.adr <= next_row_wb_addr(r1.wb.adr);
 		    end if;

 		    -- Incoming acks processing
-                    r1.forward_valid1 <= wishbone_in.ack;
 		    if wishbone_in.ack = '1' then
                        r1.rows_valid(r1.store_row mod ROW_PER_LINE) <= '1';
                        -- If this is the data we were looking for, we can
                        -- complete the request next cycle.
                        -- Compare the whole address in case the request in
                        -- r1.req is not the one that started this refill.
+                        -- (Cases where req comes from r0 are handled as a load
+                        -- hit.)
 			if r1.full = '1' and r1.req.same_tag = '1' and
-                            ((r1.dcbz = '1' and r1.req.dcbz = '1') or
-                             (r1.dcbz = '0' and r1.req.op = OP_LOAD_MISS)) and
+                            ((r1.dcbz = '1' and req.dcbz = '1') or r1.req.op = OP_LOAD_MISS) and
                            r1.store_row = get_row(r1.req.real_addr) then
                            r1.full <= '0';
                            r1.slow_valid <= '1';
@ -1395,18 +1494,17 @@ begin
                            else
                                r1.mmu_done <= '1';
                            end if;
-                            r1.forward_sel <= (others => '1');
-                            r1.use_forward1 <= '1';
 			end if;

 			-- Check for completion
-			if stbs_done and is_last_row(r1.store_row, r1.end_row_ix) then
+			if is_last_row(r1.store_row, r1.end_row_ix) then
 			    -- Complete wishbone cycle
 			    r1.wb.cyc <= '0';

 			    -- Cache line is now valid
 			    cache_valids(r1.store_index)(r1.store_way) <= '1';

+                            ev.dcache_refill <= not r1.dcbz;
                            r1.state <= IDLE;
 			end if;

@ -1430,15 +1528,17 @@ begin
                        -- See if there is another store waiting to be done
                        -- which is in the same real page.
                        if req.valid = '1' then
-                            r1.wb.adr(SET_SIZE_BITS - 1 downto 0) <=
-                                req.real_addr(SET_SIZE_BITS - 1 downto 0);
+                            r1.wb.adr(SET_SIZE_BITS - ROW_OFF_BITS - 1 downto 0) <=
+                                req.real_addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS);
                            r1.wb.dat <= req.data;
                            r1.wb.sel <= req.byte_sel;
                        end if;
-                        if acks < 7 and req.same_tag = '1' and
+                        if acks < 7 and req.same_tag = '1' and req.dcbz = '0' and
                            (req.op = OP_STORE_MISS or req.op = OP_STORE_HIT) then
                            r1.wb.stb <= '1';
                            stbs_done := false;
+                            r1.store_way <= req.hit_way;
+                            r1.store_row <= get_row(req.real_addr);
                            if req.op = OP_STORE_HIT then
                                r1.write_bram <= '1';
                            end if;
@ -1480,8 +1580,6 @@ begin
                        else
                            r1.mmu_done <= '1';
                        end if;
-                        r1.forward_sel <= (others => '1');
-                        r1.use_forward1 <= '1';
 			r1.wb.cyc <= '0';
 			r1.wb.stb <= '0';
 		    end if;
@ -1496,7 +1594,7 @@ begin
        dcache_log: process(clk)
        begin
            if rising_edge(clk) then
-                log_data <= r1.wb.adr(5 downto 3) &
+                log_data <= r1.wb.adr(2 downto 0) &
                            wishbone_in.stall &
                            wishbone_in.ack &
                            r1.wb.stb & r1.wb.cyc &
--- a/dcache_tb.vhdl
+++ b/dcache_tb.vhdl
@ -70,69 +70,69 @@ begin

    stim: process
    begin
-	-- Clear stuff
- 	d_in.valid <= '0';
- 	d_in.load <= '0';
- 	d_in.nc <= '0';
- 	d_in.addr <= (others => '0');
- 	d_in.data <= (others => '0');
+        -- Clear stuff
+        d_in.valid <= '0';
+        d_in.load <= '0';
+        d_in.nc <= '0';
+        d_in.addr <= (others => '0');
+        d_in.data <= (others => '0');
        m_in.valid <= '0';
        m_in.addr <= (others => '0');
        m_in.pte <= (others => '0');

        wait for 4*clk_period;
-	wait until rising_edge(clk);
+        wait until rising_edge(clk);

-	-- Cacheable read of address 4
-	d_in.load <= '1';
-	d_in.nc <= '0';
+        -- Cacheable read of address 4
+        d_in.load <= '1';
+        d_in.nc <= '0';
        d_in.addr <= x"0000000000000004";
        d_in.valid <= '1';
-	wait until rising_edge(clk);
+        wait until rising_edge(clk);
        d_in.valid <= '0';

-	wait until rising_edge(clk) and d_out.valid = '1';
+        wait until rising_edge(clk) and d_out.valid = '1';
        assert d_out.data = x"0000000100000000"
-	    report "data @" & to_hstring(d_in.addr) &
-	    "=" & to_hstring(d_out.data) &
-	    " expected 0000000100000000"
-	    severity failure;
+            report "data @" & to_hstring(d_in.addr) &
+            "=" & to_hstring(d_out.data) &
+            " expected 0000000100000000"
+            severity failure;
 --      wait for clk_period;

-	-- Cacheable read of address 30
-	d_in.load <= '1';
-	d_in.nc <= '0';
+        -- Cacheable read of address 30
+        d_in.load <= '1';
+        d_in.nc <= '0';
        d_in.addr <= x"0000000000000030";
        d_in.valid <= '1';
-	wait until rising_edge(clk);
+        wait until rising_edge(clk);
        d_in.valid <= '0';

-	wait until rising_edge(clk) and d_out.valid = '1';
+        wait until rising_edge(clk) and d_out.valid = '1';
        assert d_out.data = x"0000000D0000000C"
-	    report "data @" & to_hstring(d_in.addr) &
-	    "=" & to_hstring(d_out.data) &
-	    " expected 0000000D0000000C"
-	    severity failure;
-
-	-- Non-cacheable read of address 100
-	d_in.load <= '1';
-	d_in.nc <= '1';
+            report "data @" & to_hstring(d_in.addr) &
+            "=" & to_hstring(d_out.data) &
+            " expected 0000000D0000000C"
+            severity failure;
+
+        -- Non-cacheable read of address 100
+        d_in.load <= '1';
+        d_in.nc <= '1';
        d_in.addr <= x"0000000000000100";
        d_in.valid <= '1';
-	wait until rising_edge(clk);
-	d_in.valid <= '0';
-	wait until rising_edge(clk) and d_out.valid = '1';
+        wait until rising_edge(clk);
+        d_in.valid <= '0';
+        wait until rising_edge(clk) and d_out.valid = '1';
        assert d_out.data = x"0000004100000040"
-	    report "data @" & to_hstring(d_in.addr) &
-	    "=" & to_hstring(d_out.data) &
-	    " expected 0000004100000040"
-	    severity failure;
+            report "data @" & to_hstring(d_in.addr) &
+            "=" & to_hstring(d_out.data) &
+            " expected 0000004100000040"
+            severity failure;

-	wait until rising_edge(clk);
-	wait until rising_edge(clk);
-	wait until rising_edge(clk);
-	wait until rising_edge(clk);
+        wait until rising_edge(clk);
+        wait until rising_edge(clk);
+        wait until rising_edge(clk);
+        wait until rising_edge(clk);

-	std.env.finish;
+        std.env.finish;
    end process;
 end;
--- a/decode1.vhdl
+++ b/decode1.vhdl
@ -31,9 +31,10 @@ end entity decode1;
 architecture behaviour of decode1 is
    signal r, rin : Decode1ToDecode2Type;
    signal s      : Decode1ToDecode2Type;
+    signal f, fin : Decode1ToFetch1Type;

    constant illegal_inst : decode_rom_t :=
-        (NONE,   OP_ILLEGAL,   NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0');
+        (NONE, NONE, OP_ILLEGAL,   NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE);

    type reg_internal_t is record
        override : std_ulogic;
@ -47,6 +48,14 @@ architecture behaviour of decode1 is
    signal ri, ri_in : reg_internal_t;
    signal si        : reg_internal_t;

+    type br_predictor_t is record
+        br_nia    : std_ulogic_vector(61 downto 0);
+        br_offset : signed(23 downto 0);
+        predict   : std_ulogic;
+    end record;
+
+    signal br, br_in : br_predictor_t;
+
    subtype major_opcode_t is unsigned(5 downto 0);
    type major_rom_array_t is array(0 to 63) of decode_rom_t;
    type minor_valid_array_t is array(0 to 1023) of std_ulogic;
@ -61,53 +70,54 @@ architecture behaviour of decode1 is
    type op_63_subop_array_1_t is array(0 to 16) of decode_rom_t;

    constant major_decode_rom_array : major_rom_array_t := (  -- one decode_rom_t row per key 0..63 (presumably the instruction's major opcode -- confirm at the lookup site); keys not listed fall back to illegal_inst
-        --          unit     internal      in1         in2          in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
-        --                        op                                            in   out   A   out  in    out  len        ext                                 pipe
-        12 =>       (ALU,    OP_ADD,       RA,         CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- addic
-        13 =>       (ALU,    OP_ADD,       RA,         CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '0', '0', ONE,  '0', '0'), -- addic.
-        14 =>       (ALU,    OP_ADD,       RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- addi
-        15 =>       (ALU,    OP_ADD,       RA_OR_ZERO, CONST_SI_HI, NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- addis
-        28 =>       (ALU,    OP_AND,       NONE,       CONST_UI,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE,  '0', '0'), -- andi.
-        29 =>       (ALU,    OP_AND,       NONE,       CONST_UI_HI, RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE,  '0', '0'), -- andis.
-         0 =>       (ALU,    OP_ATTN,      NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- attn
-        18 =>       (ALU,    OP_B,         NONE,       CONST_LI,    NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- b
-        16 =>       (ALU,    OP_BC,        SPR,        CONST_BD,    NONE, SPR , '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- bc
-        11 =>       (ALU,    OP_CMP,       RA,         CONST_SI,    NONE, NONE, '0', '1', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- cmpi
-        10 =>       (ALU,    OP_CMP,       RA,         CONST_UI,    NONE, NONE, '0', '1', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpli
-        34 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lbz
-        35 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lbzu
-        50 =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lfd
-        51 =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lfdu
-        48 =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- lfs
-        49 =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- lfsu
-        42 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lha
-        43 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lhau
-        40 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lhz
-        41 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lhzu
-        32 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lwz
-        33 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lwzu
-         7 =>       (ALU,    OP_MUL_L64,   RA,         CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- mulli
-        24 =>       (ALU,    OP_OR,        NONE,       CONST_UI,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ori
-        25 =>       (ALU,    OP_OR,        NONE,       CONST_UI_HI, RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- oris
-        20 =>       (ALU,    OP_RLC,       RA,         CONST_SH32,  RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- rlwimi
-        21 =>       (ALU,    OP_RLC,       NONE,       CONST_SH32,  RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- rlwinm
-        23 =>       (ALU,    OP_RLC,       NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- rlwnm
-        17 =>       (ALU,    OP_SC,        NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sc
-        38 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stb
-        39 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stbu
-        54 =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, CONST_SI,    FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stfd
-        55 =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, CONST_SI,    FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stfdu
-        52 =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, CONST_SI,    FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- stfs
-        53 =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, CONST_SI,    FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- stfsu
-        44 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sth
-        45 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- sthu
-        36 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stw
-        37 =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stwu
-         8 =>       (ALU,    OP_ADD,       RA,         CONST_SI,    NONE, RT,   '0', '0', '1', '0', ONE,  '1', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- subfic
-         2 =>       (ALU,    OP_TRAP,      RA,         CONST_SI,    NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- tdi
-         3 =>       (ALU,    OP_TRAP,      RA,         CONST_SI,    NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '1'), -- twi
-        26 =>       (ALU,    OP_XOR,       NONE,       CONST_UI,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- xori
-        27 =>       (ALU,    OP_XOR,       NONE,       CONST_UI_HI, RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- xoris
+        --          unit   fac   internal      in1         in2          in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl  rpt
+        --                            op                                            in   out   A   out  in    out  len        ext                                 pipe
+        12 =>       (ALU,  NONE, OP_ADD,       RA,         CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- addic
+        13 =>       (ALU,  NONE, OP_ADD,       RA,         CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '0', '0', ONE,  '0', '0', NONE), -- addic.
+        14 =>       (ALU,  NONE, OP_ADD,       RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- addi
+        15 =>       (ALU,  NONE, OP_ADD,       RA_OR_ZERO, CONST_SI_HI, NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- addis
+        28 =>       (ALU,  NONE, OP_AND,       NONE,       CONST_UI,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE,  '0', '0', NONE), -- andi.
+        29 =>       (ALU,  NONE, OP_AND,       NONE,       CONST_UI_HI, RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE,  '0', '0', NONE), -- andis.
+         0 =>       (ALU,  NONE, OP_ATTN,      NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- attn
+        18 =>       (ALU,  NONE, OP_B,         NONE,       CONST_LI,    NONE, SPR,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), -- b
+        16 =>       (ALU,  NONE, OP_BC,        SPR,        CONST_BD,    NONE, SPR , '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), -- bc
+        11 =>       (ALU,  NONE, OP_CMP,       RA,         CONST_SI,    NONE, NONE, '0', '1', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), -- cmpi
+        10 =>       (ALU,  NONE, OP_CMP,       RA,         CONST_UI,    NONE, NONE, '0', '1', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cmpli
+        34 =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lbz
+        35 =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lbzu
+        50 =>       (LDST, FPU,  OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lfd
+        51 =>       (LDST, FPU,  OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lfdu
+        48 =>       (LDST, FPU,  OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), -- lfs
+        49 =>       (LDST, FPU,  OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0', DUPD), -- lfsu
+        42 =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lha
+        43 =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lhau
+        40 =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lhz
+        41 =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lhzu
+        56 =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, CONST_DQ,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', DRTE), -- lq
+        32 =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lwz
+        33 =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lwzu
+         7 =>       (ALU,  NONE, OP_MUL_L64,   RA,         CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), -- mulli
+        24 =>       (ALU,  NONE, OP_OR,        NONE,       CONST_UI,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- ori
+        25 =>       (ALU,  NONE, OP_OR,        NONE,       CONST_UI_HI, RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- oris
+        20 =>       (ALU,  NONE, OP_RLC,       RA,         CONST_SH32,  RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE), -- rlwimi
+        21 =>       (ALU,  NONE, OP_RLC,       NONE,       CONST_SH32,  RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE), -- rlwinm
+        23 =>       (ALU,  NONE, OP_RLC,       NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE), -- rlwnm
+        17 =>       (ALU,  NONE, OP_SC,        NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- sc
+        38 =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stb
+        39 =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stbu
+        54 =>       (LDST, FPU,  OP_STORE,     RA_OR_ZERO, CONST_SI,    FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stfd
+        55 =>       (LDST, FPU,  OP_STORE,     RA_OR_ZERO, CONST_SI,    FRS,  RA,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stfdu
+        52 =>       (LDST, FPU,  OP_STORE,     RA_OR_ZERO, CONST_SI,    FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), -- stfs
+        53 =>       (LDST, FPU,  OP_STORE,     RA_OR_ZERO, CONST_SI,    FRS,  RA,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0', NONE), -- stfsu
+        44 =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- sth
+        45 =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- sthu
+        36 =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stw
+        37 =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, CONST_SI,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stwu
+         8 =>       (ALU,  NONE, OP_ADD,       RA,         CONST_SI,    NONE, RT,   '0', '0', '1', '0', ONE,  '1', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- subfic
+         2 =>       (ALU,  NONE, OP_TRAP,      RA,         CONST_SI,    NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- tdi
+         3 =>       (ALU,  NONE, OP_TRAP,      RA,         CONST_SI,    NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), -- twi
+        26 =>       (ALU,  NONE, OP_XOR,       NONE,       CONST_UI,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- xori
+        27 =>       (ALU,  NONE, OP_XOR,       NONE,       CONST_UI_HI, RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- xoris
        others   => illegal_inst
        );

@ -121,11 +131,11 @@ architecture behaviour of decode1 is

    -- indexed by bits 5..0 of instruction word
    constant decode_op_4_array : op_4_subop_array_t := (  -- sub-decode rows for maddhd/maddhdu/maddld (major opcode 4, judging by the name -- confirm); unlisted keys use decode_rom_init. The new rows add the `fac`/`rpt` columns and change the `rc` column from RC to NONE.
-        --                   unit    internal      in1         in2          in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
-        --                                op                                            in   out   A   out  in    out  len        ext                                 pipe
-        2#110000#  =>       (ALU,    OP_MUL_H64,   RA,         RB,          RCR,  RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- maddhd
-        2#110001#  =>       (ALU,    OP_MUL_H64,   RA,         RB,          RCR,  RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- maddhdu
-        2#110011#  =>       (ALU,    OP_MUL_L64,   RA,         RB,          RCR,  RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- maddld
+        --                   unit fac   internal      in1         in2          in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl  rpt
+        --                                   op                                            in   out   A   out  in    out  len        ext                                 pipe
+        2#110000#  =>       (ALU, NONE, OP_MUL_H64,   RA,         RB,          RCR,  RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), -- maddhd
+        2#110001#  =>       (ALU, NONE, OP_MUL_H64,   RA,         RB,          RCR,  RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- maddhdu
+        2#110011#  =>       (ALU, NONE, OP_MUL_L64,   RA,         RB,          RCR,  RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), -- maddld
        others   => decode_rom_init
        );

@ -151,356 +161,363 @@ architecture behaviour of decode1 is

    -- indexed by bits 5, 3, 2 of instruction word
    constant decode_op_19_array : op_19_subop_array_t := (
-        --                 unit     internal      in1         in2          in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
-        --                               op                                            in   out   A   out  in    out  len        ext                                 pipe
+        --                 unit  fac   internal      in1         in2          in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl  rpt
+        --                                  op                                            in   out   A   out  in    out  len        ext                                 pipe
        -- mcrf; and cr logical ops
-        2#000#    =>       (ALU,    OP_CROP,      NONE,       NONE,        NONE, NONE, '1', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'),
+        2#000#    =>       (ALU, NONE, OP_CROP,      NONE,       NONE,        NONE, NONE, '1', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
        -- addpcis
-        2#001#    =>       (ALU,    OP_ADD,       CIA,        CONST_DXHI4, NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'),
+        2#001#    =>       (ALU, NONE, OP_ADD,       CIA,        CONST_DXHI4, NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
        -- bclr, bcctr, bctar
-        2#100#    =>       (ALU,    OP_BCREG,     SPR,        SPR,         NONE, SPR,  '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'),
+        2#100#    =>       (ALU, NONE, OP_BCREG,     SPR,        SPR,         NONE, SPR,  '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE),
        -- isync
-        2#111#    =>       (ALU,    OP_ISYNC,     NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'),
+        2#111#    =>       (ALU, NONE, OP_ISYNC,     NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE),
        -- rfid
-        2#101#    =>       (ALU,    OP_RFID,      SPR,        SPR,         NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'),
+        2#101#    =>       (ALU, NONE, OP_RFID,      SPR,        SPR,         NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
        others   => illegal_inst
        );

    constant decode_op_30_array : op_30_subop_array_t := (
-        --                 unit    internal      in1         in2          in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
-        --                               op                                           in   out   A   out  in    out  len        ext                                pipe
-        2#0100#  =>       (ALU,    OP_RLC,       NONE,       CONST_SH,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- rldic
-        2#0101#  =>       (ALU,    OP_RLC,       NONE,       CONST_SH,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- rldic
-        2#0000#  =>       (ALU,    OP_RLCL,      NONE,       CONST_SH,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- rldicl
-        2#0001#  =>       (ALU,    OP_RLCL,      NONE,       CONST_SH,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- rldicl
-        2#0010#  =>       (ALU,    OP_RLCR,      NONE,       CONST_SH,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- rldicr
-        2#0011#  =>       (ALU,    OP_RLCR,      NONE,       CONST_SH,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- rldicr
-        2#0110#  =>       (ALU,    OP_RLC,       RA,         CONST_SH,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- rldimi
-        2#0111#  =>       (ALU,    OP_RLC,       RA,         CONST_SH,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- rldimi
-        2#1000#  =>       (ALU,    OP_RLCL,      NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- rldcl
-        2#1001#  =>       (ALU,    OP_RLCR,      NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- rldcr
+        --                 unit fac   internal      in1         in2          in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl  rpt
+        --                                  op                                           in   out   A   out  in    out  len        ext                                pipe
+        2#0100#  =>       (ALU, NONE, OP_RLC,       NONE,       CONST_SH,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- rldic
+        2#0101#  =>       (ALU, NONE, OP_RLC,       NONE,       CONST_SH,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- rldic
+        2#0000#  =>       (ALU, NONE, OP_RLCL,      NONE,       CONST_SH,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- rldicl
+        2#0001#  =>       (ALU, NONE, OP_RLCL,      NONE,       CONST_SH,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- rldicl
+        2#0010#  =>       (ALU, NONE, OP_RLCR,      NONE,       CONST_SH,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- rldicr
+        2#0011#  =>       (ALU, NONE, OP_RLCR,      NONE,       CONST_SH,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- rldicr
+        2#0110#  =>       (ALU, NONE, OP_RLC,       RA,         CONST_SH,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- rldimi
+        2#0111#  =>       (ALU, NONE, OP_RLC,       RA,         CONST_SH,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- rldimi
+        2#1000#  =>       (ALU, NONE, OP_RLCL,      NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- rldcl
+        2#1001#  =>       (ALU, NONE, OP_RLCR,      NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- rldcr
        others   => illegal_inst
        );

    -- Note: reformat with column -t -o ' '
    constant decode_op_31_array : op_31_subop_array_t := (
-        --                       unit    internal      in1         in2          in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
-        --                                    op                                            in   out   A   out  in    out  len        ext                                 pipe
-        2#0100001010#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- add
-        2#1100001010#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- addo
-        2#0000001010#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- addc
-        2#1000001010#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- addco
-        2#0010001010#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- adde
-        2#1010001010#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- addeo
-        2#0010101010#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', OV,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- addex
-        2#0001001010#  =>       (ALU,    OP_ADDG6S,    RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- addg6s
-        2#0011101010#  =>       (ALU,    OP_ADD,       RA,         CONST_M1,    NONE, RT,   '0', '0', '0', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- addme
-        2#1011101010#  =>       (ALU,    OP_ADD,       RA,         CONST_M1,    NONE, RT,   '0', '0', '0', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- addmeo
-        2#0011001010#  =>       (ALU,    OP_ADD,       RA,         NONE,        NONE, RT,   '0', '0', '0', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- addze
-        2#1011001010#  =>       (ALU,    OP_ADD,       RA,         NONE,        NONE, RT,   '0', '0', '0', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- addzeo
-        2#0000011100#  =>       (ALU,    OP_AND,       NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- and
-        2#0000111100#  =>       (ALU,    OP_AND,       NONE,       RB,          RS,   RA,   '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- andc
-        2#0011111100#  =>       (ALU,    OP_BPERM,     NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- bperm
-        2#0100111010#  =>       (ALU,    OP_BCD,       NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cbcdtd
-        2#0100011010#  =>       (ALU,    OP_BCD,       NONE,       NONE,        RS,   RA,   '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cdtbcd
-        2#0000000000#  =>       (ALU,    OP_CMP,       RA,         RB,          NONE, NONE, '0', '1', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- cmp
-        2#0111111100#  =>       (ALU,    OP_CMPB,      NONE,       RB,          RS,   RA,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpb
-        2#0011100000#  =>       (ALU,    OP_CMPEQB,    RA,         RB,          NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpeqb
-        2#0000100000#  =>       (ALU,    OP_CMP,       RA,         RB,          NONE, NONE, '0', '1', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpl
-        2#0011000000#  =>       (ALU,    OP_CMPRB,     RA,         RB,          NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmprb
-        2#0000111010#  =>       (ALU,    OP_CNTZ,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- cntlzd
-        2#0000011010#  =>       (ALU,    OP_CNTZ,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- cntlzw
-        2#1000111010#  =>       (ALU,    OP_CNTZ,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- cnttzd
-        2#1000011010#  =>       (ALU,    OP_CNTZ,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- cnttzw
-        2#1011110011#  =>       (ALU,    OP_DARN,      NONE,       NONE,        NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- darn
-        2#0001010110#  =>       (ALU,    OP_DCBF,      NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbf
-        2#0000110110#  =>       (ALU,    OP_DCBST,     NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbst
-        2#0100010110#  =>       (ALU,    OP_DCBT,      NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbt
-        2#0011110110#  =>       (ALU,    OP_DCBTST,    NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbtst
-        2#1111110110#  =>       (LDST,   OP_DCBZ,      RA_OR_ZERO, RB,          NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- dcbz
-        2#0110001001#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- divdeu
-        2#1110001001#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- divdeuo
-        2#0110001011#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- divweu
-        2#1110001011#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- divweuo
-        2#0110101001#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- divde
-        2#1110101001#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- divdeo
-        2#0110101011#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0'), -- divwe
-        2#1110101011#  =>       (ALU,    OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0'), -- divweo
-        2#0111001001#  =>       (ALU,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- divdu
-        2#1111001001#  =>       (ALU,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- divduo
-        2#0111001011#  =>       (ALU,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- divwu
-        2#1111001011#  =>       (ALU,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- divwuo
-        2#0111101001#  =>       (ALU,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- divd
-        2#1111101001#  =>       (ALU,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- divdo
-        2#0111101011#  =>       (ALU,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0'), -- divw
-        2#1111101011#  =>       (ALU,    OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0'), -- divwo
-        2#1101010110#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- eieio
-        2#0100011100#  =>       (ALU,    OP_XOR,       NONE,       RB,          RS,   RA,   '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- eqv
-        2#1110111010#  =>       (ALU,    OP_EXTS,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- extsb
-        2#1110011010#  =>       (ALU,    OP_EXTS,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- extsh
-        2#1111011010#  =>       (ALU,    OP_EXTS,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- extsw
-        2#1101111010#  =>       (ALU,    OP_EXTSWSLI,  NONE,       CONST_SH,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- extswsli
-        2#1101111011#  =>       (ALU,    OP_EXTSWSLI,  NONE,       CONST_SH,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- extswsli
-        2#1111010110#  =>       (ALU,    OP_ICBI,      NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- icbi
-        2#0000010110#  =>       (ALU,    OP_ICBT,      NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- icbt
-        2#0000001111#  =>       (ALU,    OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- isel
-        2#0000101111#  =>       (ALU,    OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
-        2#0001001111#  =>       (ALU,    OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
-        2#0001101111#  =>       (ALU,    OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
-        2#0010001111#  =>       (ALU,    OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
-        2#0010101111#  =>       (ALU,    OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
-        2#0011001111#  =>       (ALU,    OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
-        2#0011101111#  =>       (ALU,    OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
-        2#0100001111#  =>       (ALU,    OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
-        2#0100101111#  =>       (ALU,    OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
-        2#0101001111#  =>       (ALU,    OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
-        2#0101101111#  =>       (ALU,    OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
-        2#0110001111#  =>       (ALU,    OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
-        2#0110101111#  =>       (ALU,    OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
-        2#0111001111#  =>       (ALU,    OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
-        2#0111101111#  =>       (ALU,    OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
-        2#1000001111#  =>       (ALU,    OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
-        2#1000101111#  =>       (ALU,    OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
-        2#1001001111#  =>       (ALU,    OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
-        2#1001101111#  =>       (ALU,    OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
-        2#1010001111#  =>       (ALU,    OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
-        2#1010101111#  =>       (ALU,    OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
-        2#1011001111#  =>       (ALU,    OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
-        2#1011101111#  =>       (ALU,    OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
-        2#1100001111#  =>       (ALU,    OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
-        2#1100101111#  =>       (ALU,    OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
-        2#1101001111#  =>       (ALU,    OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
-        2#1101101111#  =>       (ALU,    OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
-        2#1110001111#  =>       (ALU,    OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
-        2#1110101111#  =>       (ALU,    OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
-        2#1111001111#  =>       (ALU,    OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
-        2#1111101111#  =>       (ALU,    OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- isel
-        2#0000110100#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- lbarx
-        2#1101010101#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lbzcix
-        2#0001110111#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lbzux
-        2#0001010111#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lbzx
-        2#0001010100#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- ldarx
-        2#1000010100#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is8B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ldbrx
-        2#1101110101#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ldcix
-        2#0000110101#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- ldux
-        2#0000010101#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ldx
-        2#1001010111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lfdx
-        2#1001110111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lfdux
-        2#1101010111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lfiwax
-        2#1101110111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lfiwzx
-        2#1000010111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- lfsx
-        2#1000110111#  =>       (LDST,   OP_FPLOAD,    RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- lfsux
-        2#0001110100#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- lharx
-        2#0101110111#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lhaux
-        2#0101010111#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lhax
-        2#1100010110#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lhbrx
-        2#1100110101#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lhzcix
-        2#0100110111#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lhzux
-        2#0100010111#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lhzx
-        2#0000010100#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- lwarx
-        2#0101110101#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lwaux
-        2#0101010101#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lwax
-        2#1000010110#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is4B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lwbrx
-        2#1100010101#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lwzcix
-        2#0000110111#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lwzux
-        2#0000010111#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lwzx
-        2#1001000000#  =>       (ALU,    OP_MCRXRX,    NONE,       NONE,        NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mcrxrx
-        2#0000010011#  =>       (ALU,    OP_MFCR,      NONE,       NONE,        NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mfcr/mfocrf
-        2#0001010011#  =>       (ALU,    OP_MFMSR,     NONE,       NONE,        NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- mfmsr
-        2#0101010011#  =>       (ALU,    OP_MFSPR,     SPR,        NONE,        RS,   RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mfspr
-        2#0100001001#  =>       (ALU,    OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- modud
-        2#0100001011#  =>       (ALU,    OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- moduw
-        2#1100001001#  =>       (ALU,    OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- modsd
-        2#1100001011#  =>       (ALU,    OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', NONE, '0', '0'), -- modsw
-        2#0010010000#  =>       (ALU,    OP_MTCRF,     NONE,       NONE,        RS,   NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mtcrf/mtocrf
-        2#0010010010#  =>       (ALU,    OP_MTMSRD,    NONE,       NONE,        RS,   NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '1'), -- mtmsr
-        2#0010110010#  =>       (ALU,    OP_MTMSRD,    NONE,       NONE,        RS,   NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- mtmsrd # ignore top bits and d
-        2#0111010011#  =>       (ALU,    OP_MTSPR,     NONE,       NONE,        RS,   SPR,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mtspr
-        2#0001001001#  =>       (ALU,    OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- mulhd
-        2#0000001001#  =>       (ALU,    OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- mulhdu
-        2#0001001011#  =>       (ALU,    OP_MUL_H32,   RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0'), -- mulhw
-        2#0000001011#  =>       (ALU,    OP_MUL_H32,   RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- mulhwu
+        --                       unit  fac   internal      in1         in2          in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl  rpt
+        --                                        op                                            in   out   A   out  in    out  len        ext                                 pipe
+        2#0100001010#  =>       (ALU,  NONE, OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- add
+        2#1100001010#  =>       (ALU,  NONE, OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- addo
+        2#0000001010#  =>       (ALU,  NONE, OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- addc
+        2#1000001010#  =>       (ALU,  NONE, OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- addco
+        2#0010001010#  =>       (ALU,  NONE, OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- adde
+        2#1010001010#  =>       (ALU,  NONE, OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- addeo
+        2#0010101010#  =>       (ALU,  NONE, OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', OV,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- addex
+        2#0001001010#  =>       (ALU,  NONE, OP_ADDG6S,    RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- addg6s
+        2#0011101010#  =>       (ALU,  NONE, OP_ADD,       RA,         CONST_M1,    NONE, RT,   '0', '0', '0', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- addme
+        2#1011101010#  =>       (ALU,  NONE, OP_ADD,       RA,         CONST_M1,    NONE, RT,   '0', '0', '0', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- addmeo
+        2#0011001010#  =>       (ALU,  NONE, OP_ADD,       RA,         NONE,        NONE, RT,   '0', '0', '0', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- addze
+        2#1011001010#  =>       (ALU,  NONE, OP_ADD,       RA,         NONE,        NONE, RT,   '0', '0', '0', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- addzeo
+        2#0000011100#  =>       (ALU,  NONE, OP_AND,       NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- and
+        2#0000111100#  =>       (ALU,  NONE, OP_AND,       NONE,       RB,          RS,   RA,   '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- andc
+        2#0011111100#  =>       (ALU,  NONE, OP_BPERM,     NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- bperm
+        2#0100111010#  =>       (ALU,  NONE, OP_BCD,       NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cbcdtd
+        2#0100011010#  =>       (ALU,  NONE, OP_BCD,       NONE,       NONE,        RS,   RA,   '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cdtbcd
+        2#0000000000#  =>       (ALU,  NONE, OP_CMP,       RA,         RB,          NONE, NONE, '0', '1', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), -- cmp
+        2#0111111100#  =>       (ALU,  NONE, OP_CMPB,      NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cmpb
+        2#0011100000#  =>       (ALU,  NONE, OP_CMPEQB,    RA,         RB,          NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cmpeqb
+        2#0000100000#  =>       (ALU,  NONE, OP_CMP,       RA,         RB,          NONE, NONE, '0', '1', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cmpl
+        2#0011000000#  =>       (ALU,  NONE, OP_CMPRB,     RA,         RB,          NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cmprb
+        2#0000111010#  =>       (ALU,  NONE, OP_CNTZ,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- cntlzd
+        2#0000011010#  =>       (ALU,  NONE, OP_CNTZ,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE), -- cntlzw
+        2#1000111010#  =>       (ALU,  NONE, OP_CNTZ,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- cnttzd
+        2#1000011010#  =>       (ALU,  NONE, OP_CNTZ,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE), -- cnttzw
+        2#1011110011#  =>       (ALU,  NONE, OP_DARN,      NONE,       NONE,        NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- darn
+        2#0001010110#  =>       (ALU,  NONE, OP_DCBF,      NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- dcbf
+        2#0000110110#  =>       (ALU,  NONE, OP_DCBST,     NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- dcbst
+        2#0100010110#  =>       (ALU,  NONE, OP_DCBT,      NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- dcbt
+        2#0011110110#  =>       (ALU,  NONE, OP_DCBTST,    NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- dcbtst
+        2#1111110110#  =>       (LDST, NONE, OP_DCBZ,      RA_OR_ZERO, RB,          NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dcbz
+        2#0110001001#  =>       (ALU,  NONE, OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- divdeu
+        2#1110001001#  =>       (ALU,  NONE, OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- divdeuo
+        2#0110001011#  =>       (ALU,  NONE, OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE), -- divweu
+        2#1110001011#  =>       (ALU,  NONE, OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE), -- divweuo
+        2#0110101001#  =>       (ALU,  NONE, OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0', NONE), -- divde
+        2#1110101001#  =>       (ALU,  NONE, OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0', NONE), -- divdeo
+        2#0110101011#  =>       (ALU,  NONE, OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0', NONE), -- divwe
+        2#1110101011#  =>       (ALU,  NONE, OP_DIVE,      RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0', NONE), -- divweo
+        2#0111001001#  =>       (ALU,  NONE, OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- divdu
+        2#1111001001#  =>       (ALU,  NONE, OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- divduo
+        2#0111001011#  =>       (ALU,  NONE, OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE), -- divwu
+        2#1111001011#  =>       (ALU,  NONE, OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE), -- divwuo
+        2#0111101001#  =>       (ALU,  NONE, OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0', NONE), -- divd
+        2#1111101001#  =>       (ALU,  NONE, OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0', NONE), -- divdo
+        2#0111101011#  =>       (ALU,  NONE, OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0', NONE), -- divw
+        2#1111101011#  =>       (ALU,  NONE, OP_DIV,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0', NONE), -- divwo
+        2#1100110110#  =>       (ALU,  NONE, OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dss
+        2#0101010110#  =>       (ALU,  NONE, OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dst
+        2#0101110110#  =>       (ALU,  NONE, OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dstst
+        2#1101010110#  =>       (ALU,  NONE, OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- eieio
+        2#0100011100#  =>       (ALU,  NONE, OP_XOR,       NONE,       RB,          RS,   RA,   '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- eqv
+        2#1110111010#  =>       (ALU,  NONE, OP_EXTS,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- extsb
+        2#1110011010#  =>       (ALU,  NONE, OP_EXTS,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- extsh
+        2#1111011010#  =>       (ALU,  NONE, OP_EXTS,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- extsw
+        2#1101111010#  =>       (ALU,  NONE, OP_EXTSWSLI,  NONE,       CONST_SH,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- extswsli
+        2#1101111011#  =>       (ALU,  NONE, OP_EXTSWSLI,  NONE,       CONST_SH,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- extswsli
+        2#1111010110#  =>       (ALU,  NONE, OP_ICBI,      NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- icbi
+        2#0000010110#  =>       (ALU,  NONE, OP_ICBT,      NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- icbt
+        2#0000001111#  =>       (ALU,  NONE, OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- isel
+        2#0000101111#  =>       (ALU,  NONE, OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- isel
+        2#0001001111#  =>       (ALU,  NONE, OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- isel
+        2#0001101111#  =>       (ALU,  NONE, OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- isel
+        2#0010001111#  =>       (ALU,  NONE, OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- isel
+        2#0010101111#  =>       (ALU,  NONE, OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- isel
+        2#0011001111#  =>       (ALU,  NONE, OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- isel
+        2#0011101111#  =>       (ALU,  NONE, OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- isel
+        2#0100001111#  =>       (ALU,  NONE, OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- isel
+        2#0100101111#  =>       (ALU,  NONE, OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- isel
+        2#0101001111#  =>       (ALU,  NONE, OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- isel
+        2#0101101111#  =>       (ALU,  NONE, OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- isel
+        2#0110001111#  =>       (ALU,  NONE, OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- isel
+        2#0110101111#  =>       (ALU,  NONE, OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- isel
+        2#0111001111#  =>       (ALU,  NONE, OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- isel
+        2#0111101111#  =>       (ALU,  NONE, OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- isel
+        2#1000001111#  =>       (ALU,  NONE, OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- isel
+        2#1000101111#  =>       (ALU,  NONE, OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- isel
+        2#1001001111#  =>       (ALU,  NONE, OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- isel
+        2#1001101111#  =>       (ALU,  NONE, OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- isel
+        2#1010001111#  =>       (ALU,  NONE, OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- isel
+        2#1010101111#  =>       (ALU,  NONE, OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- isel
+        2#1011001111#  =>       (ALU,  NONE, OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- isel
+        2#1011101111#  =>       (ALU,  NONE, OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- isel
+        2#1100001111#  =>       (ALU,  NONE, OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- isel
+        2#1100101111#  =>       (ALU,  NONE, OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- isel
+        2#1101001111#  =>       (ALU,  NONE, OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- isel
+        2#1101101111#  =>       (ALU,  NONE, OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- isel
+        2#1110001111#  =>       (ALU,  NONE, OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- isel
+        2#1110101111#  =>       (ALU,  NONE, OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- isel
+        2#1111001111#  =>       (ALU,  NONE, OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- isel
+        2#1111101111#  =>       (ALU,  NONE, OP_ISEL,      RA_OR_ZERO, RB,          NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- isel
+        2#0000110100#  =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '1', '0', '0', NONE, '0', '0', NONE), -- lbarx
+        2#1101010101#  =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lbzcix
+        2#0001110111#  =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lbzux
+        2#0001010111#  =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lbzx
+        2#0001010100#  =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', NONE, '0', '0', NONE), -- ldarx
+        2#1000010100#  =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is8B, '1', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- ldbrx
+        2#1101110101#  =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- ldcix
+        2#0000110101#  =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- ldux
+        2#0000010101#  =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- ldx
+        2#1001010111#  =>       (LDST, FPU,  OP_LOAD,      RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lfdx
+        2#1001110111#  =>       (LDST, FPU,  OP_LOAD,      RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lfdux
+        2#1101010111#  =>       (LDST, FPU,  OP_LOAD,      RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lfiwax
+        2#1101110111#  =>       (LDST, FPU,  OP_LOAD,      RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lfiwzx
+        2#1000010111#  =>       (LDST, FPU,  OP_LOAD,      RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), -- lfsx
+        2#1000110111#  =>       (LDST, FPU,  OP_LOAD,      RA_OR_ZERO, RB,          NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0', DUPD), -- lfsux
+        2#0001110100#  =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', NONE, '0', '0', NONE), -- lharx
+        2#0101110111#  =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lhaux
+        2#0101010111#  =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lhax
+        2#1100010110#  =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lhbrx
+        2#1100110101#  =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lhzcix
+        2#0100110111#  =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lhzux
+        2#0100010111#  =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lhzx
+        2#0100010100#  =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', NONE, '0', '0', DRTE), -- lqarx
+        2#0000010100#  =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', NONE, '0', '0', NONE), -- lwarx
+        2#0101110101#  =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lwaux
+        2#0101010101#  =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lwax
+        2#1000010110#  =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is4B, '1', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lwbrx
+        2#1100010101#  =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lwzcix
+        2#0000110111#  =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lwzux
+        2#0000010111#  =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lwzx
+        2#1001000000#  =>       (ALU,  NONE, OP_MCRXRX,    NONE,       NONE,        NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mcrxrx
+        2#0000010011#  =>       (ALU,  NONE, OP_MFCR,      NONE,       NONE,        NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mfcr/mfocrf
+        2#0001010011#  =>       (ALU,  NONE, OP_MFMSR,     NONE,       NONE,        NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- mfmsr
+        2#0101010011#  =>       (ALU,  NONE, OP_MFSPR,     SPR,        NONE,        RS,   RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mfspr
+        2#0100001001#  =>       (ALU,  NONE, OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- modud
+        2#0100001011#  =>       (ALU,  NONE, OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), -- moduw
+        2#1100001001#  =>       (ALU,  NONE, OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), -- modsd
+        2#1100001011#  =>       (ALU,  NONE, OP_MOD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', NONE, '0', '0', NONE), -- modsw
+        2#0010010000#  =>       (ALU,  NONE, OP_MTCRF,     NONE,       NONE,        RS,   NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mtcrf/mtocrf
+        2#0010010010#  =>       (ALU,  NONE, OP_MTMSRD,    NONE,       NONE,        RS,   NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '1', NONE), -- mtmsr
+        2#0010110010#  =>       (ALU,  NONE, OP_MTMSRD,    NONE,       NONE,        RS,   NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- mtmsrd # ignore top bits and d
+        2#0111010011#  =>       (ALU,  NONE, OP_MTSPR,     NONE,       NONE,        RS,   SPR,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mtspr
+        2#0001001001#  =>       (ALU,  NONE, OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0', NONE), -- mulhd
+        2#0000001001#  =>       (ALU,  NONE, OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- mulhdu
+        2#0001001011#  =>       (ALU,  NONE, OP_MUL_H32,   RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0', NONE), -- mulhw
+        2#0000001011#  =>       (ALU,  NONE, OP_MUL_H32,   RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE), -- mulhwu
        -- next 4 have reserved bit set
-        2#1001001001#  =>       (ALU,    OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- mulhd
-        2#1000001001#  =>       (ALU,    OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- mulhdu
-        2#1001001011#  =>       (ALU,    OP_MUL_H32,   RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0'), -- mulhw
-        2#1000001011#  =>       (ALU,    OP_MUL_H32,   RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- mulhwu
-        2#0011101001#  =>       (ALU,    OP_MUL_L64,   RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- mulld
-        2#1011101001#  =>       (ALU,    OP_MUL_L64,   RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- mulldo
-        2#0011101011#  =>       (ALU,    OP_MUL_L64,   RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0'), -- mullw
-        2#1011101011#  =>       (ALU,    OP_MUL_L64,   RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0'), -- mullwo
-        2#0111011100#  =>       (ALU,    OP_AND,       NONE,       RB,          RS,   RA,   '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- nand
-        2#0001101000#  =>       (ALU,    OP_ADD,       RA,         NONE,        NONE, RT,   '0', '0', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- neg
-        2#1001101000#  =>       (ALU,    OP_ADD,       RA,         NONE,        NONE, RT,   '0', '0', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- nego
+        2#1001001001#  =>       (ALU,  NONE, OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0', NONE), -- mulhd
+        2#1000001001#  =>       (ALU,  NONE, OP_MUL_H64,   RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- mulhdu
+        2#1001001011#  =>       (ALU,  NONE, OP_MUL_H32,   RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0', NONE), -- mulhw
+        2#1000001011#  =>       (ALU,  NONE, OP_MUL_H32,   RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE), -- mulhwu
+        2#0011101001#  =>       (ALU,  NONE, OP_MUL_L64,   RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0', NONE), -- mulld
+        2#1011101001#  =>       (ALU,  NONE, OP_MUL_L64,   RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0', NONE), -- mulldo
+        2#0011101011#  =>       (ALU,  NONE, OP_MUL_L64,   RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0', NONE), -- mullw
+        2#1011101011#  =>       (ALU,  NONE, OP_MUL_L64,   RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0', NONE), -- mullwo
+        2#0111011100#  =>       (ALU,  NONE, OP_AND,       NONE,       RB,          RS,   RA,   '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- nand
+        2#0001101000#  =>       (ALU,  NONE, OP_ADD,       RA,         NONE,        NONE, RT,   '0', '0', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- neg
+        2#1001101000#  =>       (ALU,  NONE, OP_ADD,       RA,         NONE,        NONE, RT,   '0', '0', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- nego
        -- next 8 are reserved no-op instructions
-        2#1000010010#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- nop
-        2#1000110010#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- nop
-        2#1001010010#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- nop
-        2#1001110010#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- nop
-        2#1010010010#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- nop
-        2#1010110010#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- nop
-        2#1011010010#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- nop
-        2#1011110010#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- nop
-        2#0001111100#  =>       (ALU,    OP_OR,        NONE,       RB,          RS,   RA,   '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- nor
-        2#0110111100#  =>       (ALU,    OP_OR,        NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- or
-        2#0110011100#  =>       (ALU,    OP_OR,        NONE,       RB,          RS,   RA,   '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- orc
-        2#0001111010#  =>       (ALU,    OP_POPCNT,    NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntb
-        2#0111111010#  =>       (ALU,    OP_POPCNT,    NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntd
-        2#0101111010#  =>       (ALU,    OP_POPCNT,    NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntw
-        2#0010111010#  =>       (ALU,    OP_PRTY,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- prtyd
-        2#0010011010#  =>       (ALU,    OP_PRTY,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- prtyw
-        2#0010000000#  =>       (ALU,    OP_SETB,      NONE,       NONE,        NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- setb
-        2#0111110010#  =>       (LDST,   OP_TLBIE,     NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- slbia
-        2#0000011011#  =>       (ALU,    OP_SHL,       NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- sld
-        2#0000011000#  =>       (ALU,    OP_SHL,       NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- slw
-        2#1100011010#  =>       (ALU,    OP_SHR,       NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- srad
-        2#1100111010#  =>       (ALU,    OP_SHR,       NONE,       CONST_SH,    RS,   RA,   '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- sradi
-        2#1100111011#  =>       (ALU,    OP_SHR,       NONE,       CONST_SH,    RS,   RA,   '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0'), -- sradi
-        2#1100011000#  =>       (ALU,    OP_SHR,       NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0'), -- sraw
-        2#1100111000#  =>       (ALU,    OP_SHR,       NONE,       CONST_SH32,  RS,   RA,   '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0'), -- srawi
-        2#1000011011#  =>       (ALU,    OP_SHR,       NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- srd
-        2#1000011000#  =>       (ALU,    OP_SHR,       NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- srw
-        2#1111010101#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stbcix
-        2#1010110110#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '1', '0', '0', ONE,  '0', '0'), -- stbcx
-        2#0011110111#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stbux
-        2#0011010111#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stbx
-        2#1010010100#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is8B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stdbrx
-        2#1111110101#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stdcix
-        2#0011010110#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', ONE,  '0', '0'), -- stdcx
-        2#0010110101#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stdux
-        2#0010010101#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stdx
-        2#1011010111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stfdx
-        2#1011110111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stfdux
-        2#1111010111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stfiwx
-        2#1010010111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- stfsx
-        2#1010110111#  =>       (LDST,   OP_FPSTORE,   RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- stfsux
-        2#1110010110#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthbrx
-        2#1110110101#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthcix
-        2#1011010110#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', ONE,  '0', '0'), -- sthcx
-        2#0110110111#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- sthux
-        2#0110010111#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthx
-        2#1010010110#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is4B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stwbrx
-        2#1110010101#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stwcix
-        2#0010010110#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', ONE,  '0', '0'), -- stwcx
-        2#0010110111#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stwux
-        2#0010010111#  =>       (LDST,   OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stwx
-        2#0000101000#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- subf
-        2#1000101000#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- subfo
-        2#0000001000#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '1', '0', ONE,  '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- subfc
-        2#1000001000#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '1', '0', ONE,  '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- subfco
-        2#0010001000#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '1', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- subfe
-        2#1010001000#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '1', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- subfeo
-        2#0011101000#  =>       (ALU,    OP_ADD,       RA,         CONST_M1,    NONE, RT,   '0', '0', '1', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- subfme
-        2#1011101000#  =>       (ALU,    OP_ADD,       RA,         CONST_M1,    NONE, RT,   '0', '0', '1', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- subfmeo
-        2#0011001000#  =>       (ALU,    OP_ADD,       RA,         NONE,        NONE, RT,   '0', '0', '1', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- subfze
-        2#1011001000#  =>       (ALU,    OP_ADD,       RA,         NONE,        NONE, RT,   '0', '0', '1', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- subfzeo
-        2#1001010110#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- sync
-        2#0001000100#  =>       (ALU,    OP_TRAP,      RA,         RB,          NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- td
-        2#0000000100#  =>       (ALU,    OP_TRAP,      RA,         RB,          NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '1'), -- tw
-        2#0100110010#  =>       (LDST,   OP_TLBIE,     NONE,       RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- tlbie
-        2#0100010010#  =>       (LDST,   OP_TLBIE,     NONE,       RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- tlbiel
-        2#0000011110#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- wait
-        2#0100111100#  =>       (ALU,    OP_XOR,       NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- xor
+        2#1000010010#  =>       (ALU,  NONE, OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- nop
+        2#1000110010#  =>       (ALU,  NONE, OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- nop
+        2#1001010010#  =>       (ALU,  NONE, OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- nop
+        2#1001110010#  =>       (ALU,  NONE, OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- nop
+        2#1010010010#  =>       (ALU,  NONE, OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- nop
+        2#1010110010#  =>       (ALU,  NONE, OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- nop
+        2#1011010010#  =>       (ALU,  NONE, OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- nop
+        2#1011110010#  =>       (ALU,  NONE, OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- nop
+        2#0001111100#  =>       (ALU,  NONE, OP_OR,        NONE,       RB,          RS,   RA,   '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- nor
+        2#0110111100#  =>       (ALU,  NONE, OP_OR,        NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- or
+        2#0110011100#  =>       (ALU,  NONE, OP_OR,        NONE,       RB,          RS,   RA,   '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- orc
+        2#0001111010#  =>       (ALU,  NONE, OP_POPCNT,    NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- popcntb
+        2#0111111010#  =>       (ALU,  NONE, OP_POPCNT,    NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- popcntd
+        2#0101111010#  =>       (ALU,  NONE, OP_POPCNT,    NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- popcntw
+        2#0010111010#  =>       (ALU,  NONE, OP_PRTY,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- prtyd
+        2#0010011010#  =>       (ALU,  NONE, OP_PRTY,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- prtyw
+        2#0010000000#  =>       (ALU,  NONE, OP_SETB,      NONE,       NONE,        NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- setb
+        2#0111110010#  =>       (LDST, NONE, OP_TLBIE,     NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- slbia
+        2#0000011011#  =>       (ALU,  NONE, OP_SHL,       NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- sld
+        2#0000011000#  =>       (ALU,  NONE, OP_SHL,       NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE), -- slw
+        2#1100011010#  =>       (ALU,  NONE, OP_SHR,       NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0', NONE), -- srad
+        2#1100111010#  =>       (ALU,  NONE, OP_SHR,       NONE,       CONST_SH,    RS,   RA,   '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0', NONE), -- sradi
+        2#1100111011#  =>       (ALU,  NONE, OP_SHR,       NONE,       CONST_SH,    RS,   RA,   '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '0', '1', RC,   '0', '0', NONE), -- sradi
+        2#1100011000#  =>       (ALU,  NONE, OP_SHR,       NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0', NONE), -- sraw
+        2#1100111000#  =>       (ALU,  NONE, OP_SHR,       NONE,       CONST_SH32,  RS,   RA,   '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '1', '1', RC,   '0', '0', NONE), -- srawi
+        2#1000011011#  =>       (ALU,  NONE, OP_SHR,       NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- srd
+        2#1000011000#  =>       (ALU,  NONE, OP_SHR,       NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE), -- srw
+        2#1111010101#  =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stbcix
+        2#1010110110#  =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '1', '0', '0', ONE,  '0', '0', NONE), -- stbcx
+        2#0011110111#  =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stbux
+        2#0011010111#  =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stbx
+        2#1010010100#  =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is8B, '1', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stdbrx
+        2#1111110101#  =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stdcix
+        2#0011010110#  =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', ONE,  '0', '0', NONE), -- stdcx
+        2#0010110101#  =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stdux
+        2#0010010101#  =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stdx
+        2#1011010111#  =>       (LDST, FPU,  OP_STORE,     RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stfdx
+        2#1011110111#  =>       (LDST, FPU,  OP_STORE,     RA_OR_ZERO, RB,          FRS,  RA,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stfdux
+        2#1111010111#  =>       (LDST, FPU,  OP_STORE,     RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stfiwx
+        2#1010010111#  =>       (LDST, FPU,  OP_STORE,     RA_OR_ZERO, RB,          FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), -- stfsx
+        2#1010110111#  =>       (LDST, FPU,  OP_STORE,     RA_OR_ZERO, RB,          FRS,  RA,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0', NONE), -- stfsux
+        2#1110010110#  =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- sthbrx
+        2#1110110101#  =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- sthcix
+        2#1011010110#  =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', ONE,  '0', '0', NONE), -- sthcx
+        2#0110110111#  =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- sthux
+        2#0110010111#  =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- sthx
+        2#0010110110#  =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', ONE,  '0', '0', DRSE), -- stqcx
+        2#1010010110#  =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is4B, '1', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stwbrx
+        2#1110010101#  =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stwcix
+        2#0010010110#  =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', ONE,  '0', '0', NONE), -- stwcx
+        2#0010110111#  =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stwux
+        2#0010010111#  =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stwx
+        2#0000101000#  =>       (ALU,  NONE, OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- subf
+        2#1000101000#  =>       (ALU,  NONE, OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- subfo
+        2#0000001000#  =>       (ALU,  NONE, OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '1', '0', ONE,  '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- subfc
+        2#1000001000#  =>       (ALU,  NONE, OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '1', '0', ONE,  '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- subfco
+        2#0010001000#  =>       (ALU,  NONE, OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '1', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- subfe
+        2#1010001000#  =>       (ALU,  NONE, OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '1', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- subfeo
+        2#0011101000#  =>       (ALU,  NONE, OP_ADD,       RA,         CONST_M1,    NONE, RT,   '0', '0', '1', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- subfme
+        2#1011101000#  =>       (ALU,  NONE, OP_ADD,       RA,         CONST_M1,    NONE, RT,   '0', '0', '1', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- subfmeo
+        2#0011001000#  =>       (ALU,  NONE, OP_ADD,       RA,         NONE,        NONE, RT,   '0', '0', '1', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- subfze
+        2#1011001000#  =>       (ALU,  NONE, OP_ADD,       RA,         NONE,        NONE, RT,   '0', '0', '1', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- subfzeo
+        2#1001010110#  =>       (ALU,  NONE, OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- sync
+        2#0001000100#  =>       (ALU,  NONE, OP_TRAP,      RA,         RB,          NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- td
+        2#0000000100#  =>       (ALU,  NONE, OP_TRAP,      RA,         RB,          NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), -- tw
+        2#0100110010#  =>       (LDST, NONE, OP_TLBIE,     NONE,       RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- tlbie
+        2#0100010010#  =>       (LDST, NONE, OP_TLBIE,     NONE,       RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- tlbiel
+        2#1000110110#  =>       (ALU,  NONE, OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- tlbsync
+        2#0000011110#  =>       (ALU,  NONE, OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- wait
+        2#0100111100#  =>       (ALU,  NONE, OP_XOR,       NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- xor
        others => illegal_inst
 	);

    constant decode_op_58_array : minor_rom_array_2_t := (
-        --              unit    internal      in1         in2          in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
-        --                           op                                            in   out   A   out  in    out  len        ext                                 pipe
-        0     =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_DS,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ld
-        1     =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_DS,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- ldu
-        2     =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_DS,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lwa
+        --              unit  fac   internal      in1         in2          in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl  rpt
+        --                               op                                            in   out   A   out  in    out  len        ext                                 pipe
+        0     =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, CONST_DS,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- ld
+        1     =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, CONST_DS,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- ldu
+        2     =>       (LDST, NONE, OP_LOAD,      RA_OR_ZERO, CONST_DS,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lwa
        others   => decode_rom_init
        );

    constant decode_op_59_array : op_59_subop_array_t := (
-        --             unit   internal       in1   in2   in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
-        --                          op                               in   out   A   out  in    out  len        ext                                pipe
-        2#01110#  =>  (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fcfid[u]s
-        2#10010#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fdivs
-        2#10100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fsubs
-        2#10101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fadds
-        2#10110#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fsqrts
-        2#11000#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fres
-        2#11001#  =>  (FPU,   OP_FPOP,       FRA,  NONE, FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fmuls
-        2#11010#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- frsqrtes
-        2#11100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fmsubs
-        2#11101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fmadds
-        2#11110#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fnmsubs
-        2#11111#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fnmadds
+        --             unit fac  internal       in1   in2   in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl  rpt
+        --                             op                               in   out   A   out  in    out  len        ext                                pipe
+        2#01110#  =>  (FPU, FPU, OP_FPOP_I,     NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE), -- fcfid[u]s
+        2#10010#  =>  (FPU, FPU, OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE), -- fdivs
+        2#10100#  =>  (FPU, FPU, OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE), -- fsubs
+        2#10101#  =>  (FPU, FPU, OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE), -- fadds
+        2#10110#  =>  (FPU, FPU, OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE), -- fsqrts
+        2#11000#  =>  (FPU, FPU, OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE), -- fres
+        2#11001#  =>  (FPU, FPU, OP_FPOP,       FRA,  NONE, FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE), -- fmuls
+        2#11010#  =>  (FPU, FPU, OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE), -- frsqrtes
+        2#11100#  =>  (FPU, FPU, OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE), -- fmsubs
+        2#11101#  =>  (FPU, FPU, OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE), -- fmadds
+        2#11110#  =>  (FPU, FPU, OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE), -- fnmsubs
+        2#11111#  =>  (FPU, FPU, OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE), -- fnmadds
        others => illegal_inst
        );

    constant decode_op_62_array : minor_rom_array_2_t := (
-        --              unit    internal      in1         in2          in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
-        --                            op                                           in   out   A   out  in    out  len        ext                                 pipe
-        0     =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_DS,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- std
-        1     =>       (LDST,   OP_STORE,     RA_OR_ZERO, CONST_DS,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stdu
+        --              unit  fac   internal      in1         in2          in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl  rpt
+        --                                op                                           in   out   A   out  in    out  len        ext                                 pipe
+        0     =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, CONST_DS,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- std
+        1     =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, CONST_DS,    RS,   RA,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stdu
+        2     =>       (LDST, NONE, OP_STORE,     RA_OR_ZERO, CONST_DS,    RS,   NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', DRSE), -- stq
        others   => decode_rom_init
        );

    -- indexed by bits 4..1 and 10..6 of instruction word
    constant decode_op_63l_array : op_63_subop_array_0_t := (
-        --                unit   internal       in1   in2   in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
-        --                             op                               in   out   A   out  in    out  len        ext                                pipe
-        2#000000000#  => (FPU,   OP_FPOP,       FRA,  FRB,  NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), --  0/0=fcmpu
-        2#000000001#  => (FPU,   OP_FPOP,       FRA,  FRB,  NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), --  1/0=fcmpo
-        2#000000010#  => (FPU,   OP_FPOP,       NONE, NONE, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), --  2/0=mcrfs
-        2#000000100#  => (FPU,   OP_FPOP,       FRA,  FRB,  NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), --  4/0=ftdiv
-        2#000000101#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), --  5/0=ftsqrt
-        2#011000001#  => (FPU,   OP_FPOP,       NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  1/6=mtfsb1
-        2#011000010#  => (FPU,   OP_FPOP,       NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  2/6=mtfsb0
-        2#011000100#  => (FPU,   OP_FPOP,       NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  4/6=mtfsfi
-        2#011011010#  => (FPU,   OP_FPOP_I,     FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- 26/6=fmrgow
-        2#011011110#  => (FPU,   OP_FPOP_I,     FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- 30/6=fmrgew
-        2#011110010#  => (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 18/7=mffs family
-        2#011110110#  => (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 22/7=mtfsf
-        2#100000000#  => (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  0/8=fcpsgn
-        2#100000001#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  1/8=fneg
-        2#100000010#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  2/8=fmr
-        2#100000100#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  4/8=fnabs
-        2#100001000#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  8/8=fabs
-        2#100001100#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 12/8=frin
-        2#100001101#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 13/8=friz
-        2#100001110#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 14/8=frip
-        2#100001111#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 15/8=frim
-        2#110000000#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), --  0/12=frsp
-        2#111000000#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  0/14=fctiw
-        2#111000100#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  4/14=fctiwu
-        2#111011001#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 25/14=fctid
-        2#111011010#  => (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 26/14=fcfid
-        2#111011101#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 29/14=fctidu
-        2#111011110#  => (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 30/14=fcfidu
-        2#111100000#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  0/15=fctiwz
-        2#111100100#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), --  4/15=fctiwuz
-        2#111111001#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 25/15=fctidz
-        2#111111101#  => (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- 29/15=fctiduz
+        --                unit fac  internal       in1   in2   in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl  rpt
+        --                                op                               in   out   A   out  in    out  len        ext                                pipe
+        2#000000000#  => (FPU, FPU, OP_FPOP,       FRA,  FRB,  NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), --  0/0=fcmpu
+        2#000000001#  => (FPU, FPU, OP_FPOP,       FRA,  FRB,  NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), --  1/0=fcmpo
+        2#000000010#  => (FPU, FPU, OP_FPOP,       NONE, NONE, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), --  2/0=mcrfs
+        2#000000100#  => (FPU, FPU, OP_FPOP,       FRA,  FRB,  NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), --  4/0=ftdiv
+        2#000000101#  => (FPU, FPU, OP_FPOP,       NONE, FRB,  NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), --  5/0=ftsqrt
+        2#011000001#  => (FPU, FPU, OP_FPOP,       NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), --  1/6=mtfsb1
+        2#011000010#  => (FPU, FPU, OP_FPOP,       NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), --  2/6=mtfsb0
+        2#011000100#  => (FPU, FPU, OP_FPOP,       NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), --  4/6=mtfsfi
+        2#011011010#  => (FPU, FPU, OP_FPOP_I,     FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- 26/6=fmrgow
+        2#011011110#  => (FPU, FPU, OP_FPOP_I,     FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- 30/6=fmrgew
+        2#011110010#  => (FPU, FPU, OP_FPOP_I,     NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- 18/7=mffs family
+        2#011110110#  => (FPU, FPU, OP_FPOP_I,     NONE, FRB,  NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- 22/7=mtfsf
+        2#100000000#  => (FPU, FPU, OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), --  0/8=fcpsgn
+        2#100000001#  => (FPU, FPU, OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), --  1/8=fneg
+        2#100000010#  => (FPU, FPU, OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), --  2/8=fmr
+        2#100000100#  => (FPU, FPU, OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), --  4/8=fnabs
+        2#100001000#  => (FPU, FPU, OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), --  8/8=fabs
+        2#100001100#  => (FPU, FPU, OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- 12/8=frin
+        2#100001101#  => (FPU, FPU, OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- 13/8=friz
+        2#100001110#  => (FPU, FPU, OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- 14/8=frip
+        2#100001111#  => (FPU, FPU, OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- 15/8=frim
+        2#110000000#  => (FPU, FPU, OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE), --  0/12=frsp
+        2#111000000#  => (FPU, FPU, OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), --  0/14=fctiw
+        2#111000100#  => (FPU, FPU, OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), --  4/14=fctiwu
+        2#111011001#  => (FPU, FPU, OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- 25/14=fctid
+        2#111011010#  => (FPU, FPU, OP_FPOP_I,     NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- 26/14=fcfid
+        2#111011101#  => (FPU, FPU, OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- 29/14=fctidu
+        2#111011110#  => (FPU, FPU, OP_FPOP_I,     NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- 30/14=fcfidu
+        2#111100000#  => (FPU, FPU, OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), --  0/15=fctiwz
+        2#111100100#  => (FPU, FPU, OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), --  4/15=fctiwuz
+        2#111111001#  => (FPU, FPU, OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- 25/15=fctidz
+        2#111111101#  => (FPU, FPU, OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- 29/15=fctiduz
        others => illegal_inst
        );

    -- indexed by bits 4..1 of instruction word
    constant decode_op_63h_array : op_63_subop_array_1_t := (
-        --            unit   internal       in1   in2   in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
-        --                         op                               in   out   A   out  in    out  len        ext                                pipe
-        2#0010#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fdiv
-        2#0100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fsub
-        2#0101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fadd
-        2#0110#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fsqrt
-        2#0111#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fsel
-        2#1000#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fre
-        2#1001#  =>  (FPU,   OP_FPOP,       FRA,  NONE, FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fmul
-        2#1010#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- frsqrte
-        2#1100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fmsub
-        2#1101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fmadd
-        2#1110#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fnmsub
-        2#1111#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fnmadd
+        --            unit fac  internal       in1   in2   in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl  rpt
+        --                            op                               in   out   A   out  in    out  len        ext                                pipe
+        2#0010#  =>  (FPU, FPU, OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- fdiv
+        2#0100#  =>  (FPU, FPU, OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- fsub
+        2#0101#  =>  (FPU, FPU, OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- fadd
+        2#0110#  =>  (FPU, FPU, OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- fsqrt
+        2#0111#  =>  (FPU, FPU, OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- fsel
+        2#1000#  =>  (FPU, FPU, OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- fre
+        2#1001#  =>  (FPU, FPU, OP_FPOP,       FRA,  NONE, FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- fmul
+        2#1010#  =>  (FPU, FPU, OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- frsqrte
+        2#1100#  =>  (FPU, FPU, OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- fmsub
+        2#1101#  =>  (FPU, FPU, OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- fmadd
+        2#1110#  =>  (FPU, FPU, OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- fnmsub
+        2#1111#  =>  (FPU, FPU, OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE), -- fnmadd
        others => illegal_inst
        );

-    --                                        unit   internal         in1         in2          in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
-    --                                                     op                                              in   out   A   out  in    out  len        ext                                 pipe
-    constant nop_instr      : decode_rom_t := (ALU,  OP_NOP,          NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0');
-    constant fetch_fail_inst: decode_rom_t := (LDST, OP_FETCH_FAILED, NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0');
+    --                                        unit   fac   internal         in1         in2          in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl  rpt
+    --                                                           op                                              in   out   A   out  in    out  len        ext                                 pipe
+    constant nop_instr      : decode_rom_t := (ALU,  NONE, OP_NOP,          NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE);
+    constant fetch_fail_inst: decode_rom_t := (LDST, NONE, OP_FETCH_FAILED, NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE);

 begin
    decode1_0: process(clk)
@ -529,6 +546,13 @@ begin
                    ri <= ri_in;
                end if;
            end if;
+            if rst = '1' then
+                br.br_nia <= (others => '0');
+                br.br_offset <= (others => '0');
+                br.predict <= '0';
+            else
+                br <= br_in;
+            end if;
        end if;
    end process;
    busy_out <= s.valid;
@ -536,14 +560,13 @@ begin
    decode1_1: process(all)
        variable v : Decode1ToDecode2Type;
        variable vi : reg_internal_t;
-        variable f : Decode1ToFetch1Type;
        variable majorop : major_opcode_t;
        variable minor4op : std_ulogic_vector(10 downto 0);
        variable op_19_bits: std_ulogic_vector(2 downto 0);
        variable sprn : spr_num_t;
-        variable br_nia    : std_ulogic_vector(61 downto 0);
        variable br_target : std_ulogic_vector(61 downto 0);
        variable br_offset : signed(23 downto 0);
+        variable bv : br_predictor_t;
    begin
        v := Decode1ToDecode2Init;
        vi := reg_internal_t_init;
@ -552,6 +575,7 @@ begin
        v.nia  := f_in.nia;
        v.insn := f_in.insn;
        v.stop_mark := f_in.stop_mark;
+        v.big_endian := f_in.big_endian;

        if f_in.valid = '1' then
            report "Decode insn " & to_hstring(f_in.insn) & " at " & to_hstring(f_in.nia);
@ -573,9 +597,10 @@ begin
            -- major opcode 31, lots of things
            v.decode := decode_op_31_array(to_integer(unsigned(f_in.insn(10 downto 1))));

-            -- Work out ispr1/ispr2 independent of v.decode since they seem to be critical path
+            -- Work out ispr1/ispro independently of v.decode since they seem to be on the critical path
            sprn := decode_spr_num(f_in.insn);
            v.ispr1 := fast_spr_num(sprn);
+            v.ispro := fast_spr_num(sprn);

            if std_match(f_in.insn(10 downto 1), "01-1010011") then
                -- mfspr or mtspr
@ -584,18 +609,28 @@ begin
                    vi.force_single := '1';
                    -- send MMU-related SPRs to loadstore1
                    case sprn is
-                        when SPR_DAR | SPR_DSISR | SPR_PID | SPR_PRTBL =>
+                        when SPR_DAR | SPR_DSISR | SPR_PID | SPR_PTCR =>
                            vi.override_decode.unit := LDST;
                            vi.override_unit := '1';
                        when others =>
                    end case;
                end if;
            end if;
+            if std_match(f_in.insn(10 downto 1), "0100010100") then
+                -- lqarx, illegal if RA = RT or RB = RT
+                if f_in.insn(25 downto 21) = f_in.insn(20 downto 16) or
+                    f_in.insn(25 downto 21) = f_in.insn(15 downto 11) then
+                    vi.override := '1';
+                end if;
+            end if;

        when 16 =>
            -- CTR may be needed as input to bc
            if f_in.insn(23) = '0' then
                v.ispr1 := fast_spr_num(SPR_CTR);
+                v.ispro := fast_spr_num(SPR_CTR);
+            elsif f_in.insn(0) = '1' then
+                v.ispro := fast_spr_num(SPR_LR);
            end if;
            -- Predict backward branches as taken, forward as untaken
            v.br_pred := f_in.insn(15);
@ -605,6 +640,9 @@ begin
            -- Unconditional branches are always taken
            v.br_pred := '1';
            br_offset := signed(f_in.insn(25 downto 2));
+            if f_in.insn(0) = '1' then
+                v.ispro := fast_spr_num(SPR_LR);
+            end if;

        when 19 =>
            vi.override := not decode_op_19_valid(to_integer(unsigned(f_in.insn(5 downto 1) & f_in.insn(10 downto 6))));
@ -617,8 +655,12 @@ begin
                -- Branch uses CTR as condition when BO(2) is 0. This is
                -- also used to indicate that CTR is modified (they go
                -- together).
-                if f_in.insn(23) = '0' then
+                -- bcctr doesn't update CTR or use it in the branch condition
+                if f_in.insn(23) = '0' and (f_in.insn(10) = '0' or f_in.insn(6) = '1') then
                    v.ispr1 := fast_spr_num(SPR_CTR);
+                    v.ispro := fast_spr_num(SPR_CTR);
+                elsif f_in.insn(0) = '1' then
+                    v.ispro := fast_spr_num(SPR_LR);
                end if;
                if f_in.insn(10) = '0' then
                    v.ispr2 := fast_spr_num(SPR_LR);
@ -633,10 +675,7 @@ begin
                v.ispr2 := fast_spr_num(SPR_SRR0);
            end if;

-        when 30 =>
-            v.decode := decode_op_30_array(to_integer(unsigned(f_in.insn(4 downto 1))));
-
-        when 48 =>
+        when 24 =>
            -- ori, special-case the standard NOP
            if std_match(f_in.insn, "01100000000000000000000000000000") then
                report "PPC_nop";
@ -644,6 +683,15 @@ begin
                vi.override_decode := nop_instr;
            end if;

+        when 30 =>
+            v.decode := decode_op_30_array(to_integer(unsigned(f_in.insn(4 downto 1))));
+
+        when 56 =>
+            -- lq, illegal if RA = RT
+            if f_in.insn(25 downto 21) = f_in.insn(20 downto 16) then
+                vi.override := '1';
+            end if;
+
        when 58 =>
            v.decode := decode_op_58_array(to_integer(unsigned(f_in.insn(1 downto 0))));

@ -685,17 +733,24 @@ begin
        -- Branch predictor
        -- Note bclr, bcctr and bctar are predicted not taken as we have no
        -- count cache or link stack.
-        br_nia := f_in.nia(63 downto 2);
+        bv.br_nia := f_in.nia(63 downto 2);
        if f_in.insn(1) = '1' then
-            br_nia := (others => '0');
+            bv.br_nia := (others => '0');
+        end if;
+        bv.br_offset := br_offset;
+        if f_in.next_predicted = '1' then
+            v.br_pred := '1';
+        elsif f_in.next_pred_ntaken = '1' then
+            v.br_pred := '0';
        end if;
-        br_target := std_ulogic_vector(signed(br_nia) + br_offset);
-        f.redirect := v.br_pred and f_in.valid and not flush_in and not s.valid;
-        f.redirect_nia := br_target & "00";
+        bv.predict := v.br_pred and f_in.valid and not flush_in and not busy_out and not f_in.next_predicted;
+        -- The redirect target is computed from the registered predictor state (br), i.e. after the clock edge that latched br_in
+        br_target := std_ulogic_vector(signed(br.br_nia) + br.br_offset);

        -- Update registers
        rin <= v;
        ri_in <= vi;
+        br_in <= bv;

        -- Update outputs
        d_out <= r;
@ -707,8 +762,9 @@ begin
        if ri.force_single = '1' then
            d_out.decode.sgl_pipe <= '1';
        end if;
-        f_out <= f;
-        flush_out <= f.redirect;
+        f_out.redirect <= br.predict;
+        f_out.redirect_nia <= br_target & "00";
+        flush_out <= bv.predict or br.predict;
    end process;

    d1_log: if LOG_LENGTH > 0 generate
--- a/decode2.vhdl
+++ b/decode2.vhdl
@ -19,7 +19,7 @@ entity decode2 is
        clk   : in std_ulogic;
        rst   : in std_ulogic;

-        complete_in : in std_ulogic;
+        complete_in : in instr_tag_t;
        busy_in   : in std_ulogic;
        stall_out : out std_ulogic;

@ -37,6 +37,9 @@ entity decode2 is
        c_in  : in CrFileToDecode2Type;
        c_out : out Decode2ToCrFileType;

+        execute_bypass    : in bypass_data_t;
+        execute_cr_bypass : in cr_bypass_data_t;
+
        log_out : out std_ulogic_vector(9 downto 0)
 	);
 end entity decode2;
@ -44,6 +47,7 @@ end entity decode2;
 architecture behaviour of decode2 is
    type reg_type is record
        e : Decode2ToExecute1Type;
+        repeat : std_ulogic;
    end record;

    signal r, rin : reg_type;
@ -115,6 +119,8 @@ architecture behaviour of decode2 is
                ret := ('0', (others => '0'), std_ulogic_vector(resize(signed(insn_bd(insn_in)) & "00", 64)));
            when CONST_DS =>
                ret := ('0', (others => '0'), std_ulogic_vector(resize(signed(insn_ds(insn_in)) & "00", 64)));
+            when CONST_DQ =>
+                ret := ('0', (others => '0'), std_ulogic_vector(resize(signed(insn_dq(insn_in)) & "0000", 64)));
            when CONST_DXHI4 =>
                ret := ('0', (others => '0'), std_ulogic_vector(resize(signed(insn_dx(insn_in)) & x"0004", 64)));
            when CONST_M1 =>
@ -202,54 +208,99 @@ architecture behaviour of decode2 is
        end case;
    end;

-    -- For now, use "rc" in the decode table to decide whether oe exists.
-    -- This is not entirely correct architecturally: For mulhd and
-    -- mulhdu, the OE field is reserved. It remains to be seen what an
-    -- actual POWER9 does if we set it on those instructions, for now we
-    -- test that further down when assigning to the multiplier oe input.
-    --
-    function decode_oe (t : rc_t; insn_in : std_ulogic_vector(31 downto 0)) return std_ulogic is
-    begin
-        case t is
-            when RC =>
-                return insn_oe(insn_in);
-            when OTHERS =>
-                return '0';
-        end case;
-    end;
+    -- Control signals that are derived from insn_type: each table below maps an internal op to a 3-bit mux select code
+    type mux_select_array_t is array(insn_type_t) of std_ulogic_vector(2 downto 0);  -- one 3-bit code per insn_type_t value
+
+    constant result_select : mux_select_array_t := (  -- the first op of each group names the result its code stands for
+        OP_AND      => "001",           -- logical_result
+        OP_OR       => "001",
+        OP_XOR      => "001",
+        OP_PRTY     => "001",
+        OP_CMPB     => "001",
+        OP_EXTS     => "001",
+        OP_BPERM    => "001",
+        OP_BCD      => "001",
+        OP_MTSPR    => "001",
+        OP_RLC      => "010",           -- rotator_result
+        OP_RLCL     => "010",
+        OP_RLCR     => "010",
+        OP_SHL      => "010",
+        OP_SHR      => "010",
+        OP_EXTSWSLI => "010",
+        OP_MUL_L64  => "011",           -- muldiv_result
+        OP_MUL_H64  => "011",
+        OP_MUL_H32  => "011",
+        OP_DIV      => "011",
+        OP_DIVE     => "011",
+        OP_MOD      => "011",
+        OP_CNTZ     => "100",           -- countbits_result
+        OP_POPCNT   => "100",
+        OP_MFSPR    => "101",           -- spr_result
+        OP_B        => "110",           -- next_nia
+        OP_BC       => "110",
+        OP_BCREG    => "110",
+        OP_ADDG6S   => "111",           -- misc_result
+        OP_ISEL     => "111",
+        OP_DARN     => "111",
+        OP_MFMSR    => "111",
+        OP_MFCR     => "111",
+        OP_SETB     => "111",
+        others      => "000"            -- default to adder_result (every op not listed above)
+        );  -- NOTE(review): the consumer of these codes is outside this view -- presumably the execute stage; confirm
+
+    constant subresult_select : mux_select_array_t := (  -- per-op select code within the muldiv/misc/cr groups named below
+        OP_MUL_L64 => "000",            -- muldiv_result
+        OP_MUL_H64 => "001",
+        OP_MUL_H32 => "010",
+        OP_DIV     => "011",            -- OP_DIV, OP_DIVE and OP_MOD all share code "011"
+        OP_DIVE    => "011",
+        OP_MOD     => "011",
+        OP_ADDG6S  => "001",            -- misc_result
+        OP_ISEL    => "010",
+        OP_DARN    => "011",
+        OP_MFMSR   => "100",
+        OP_MFCR    => "101",
+        OP_SETB    => "110",
+        OP_CMP     => "000",            -- cr_result
+        OP_CMPRB   => "001",
+        OP_CMPEQB  => "010",
+        OP_CROP    => "011",
+        OP_MCRXRX  => "100",
+        OP_MTCRF   => "101",
+        others     => "000"             -- every op not listed above gets "000"
+        );

    -- issue control signals
    signal control_valid_in : std_ulogic;
    signal control_valid_out : std_ulogic;
+    signal control_stall_out : std_ulogic;
    signal control_sgl_pipe : std_logic;

    signal gpr_write_valid : std_ulogic;
    signal gpr_write : gspr_index_t;
-    signal gpr_bypassable  : std_ulogic;
-
-    signal update_gpr_write_valid : std_ulogic;
-    signal update_gpr_write_reg : gspr_index_t;

    signal gpr_a_read_valid : std_ulogic;
-    signal gpr_a_read :gspr_index_t;
-    signal gpr_a_bypass : std_ulogic;
+    signal gpr_a_read       : gspr_index_t;
+    signal gpr_a_bypass     : std_ulogic;

    signal gpr_b_read_valid : std_ulogic;
-    signal gpr_b_read : gspr_index_t;
-    signal gpr_b_bypass : std_ulogic;
+    signal gpr_b_read       : gspr_index_t;
+    signal gpr_b_bypass     : std_ulogic;

    signal gpr_c_read_valid : std_ulogic;
-    signal gpr_c_read : gspr_index_t;
-    signal gpr_c_bypass : std_ulogic;
+    signal gpr_c_read       : gspr_index_t;
+    signal gpr_c_bypass     : std_ulogic;

+    signal cr_read_valid   : std_ulogic;
    signal cr_write_valid  : std_ulogic;
    signal cr_bypass       : std_ulogic;
-    signal cr_bypass_avail : std_ulogic;
+
+    signal instr_tag       : instr_tag_t;

 begin
    control_0: entity work.control
 	generic map (
-            PIPELINE_DEPTH => 1
+            EX1_BYPASS => EX1_BYPASS
            )
 	port map (
            clk         => clk,
@ -257,6 +308,7 @@ begin

            complete_in => complete_in,
            valid_in    => control_valid_in,
+            repeated    => r.repeat,
            busy_in     => busy_in,
            deferred    => deferred,
            flush_in    => flush_in,
@ -265,10 +317,6 @@ begin

            gpr_write_valid_in => gpr_write_valid,
            gpr_write_in       => gpr_write,
-            gpr_bypassable     => gpr_bypassable,
-
-            update_gpr_write_valid => update_gpr_write_valid,
-            update_gpr_write_reg => update_gpr_write_reg,

            gpr_a_read_valid_in  => gpr_a_read_valid,
            gpr_a_read_in        => gpr_a_read,
@ -279,18 +327,22 @@ begin
            gpr_c_read_valid_in  => gpr_c_read_valid,
            gpr_c_read_in        => gpr_c_read,

-            cr_read_in           => d_in.decode.input_cr,
+            execute_next_tag     => execute_bypass.tag,
+            execute_next_cr_tag  => execute_cr_bypass.tag,
+
+            cr_read_in           => cr_read_valid,
            cr_write_in          => cr_write_valid,
            cr_bypass            => cr_bypass,
-            cr_bypassable        => cr_bypass_avail,

            valid_out   => control_valid_out,
-            stall_out   => stall_out,
+            stall_out   => control_stall_out,
            stopped_out => stopped_out,

            gpr_bypass_a => gpr_a_bypass,
            gpr_bypass_b => gpr_b_bypass,
-            gpr_bypass_c => gpr_c_bypass
+            gpr_bypass_c => gpr_c_bypass,
+
+            instr_tag_out => instr_tag
            );

    deferred <= r.e.valid and busy_in;
@ -307,17 +359,6 @@ begin
        end if;
    end process;

-    r_out.read1_reg <= d_in.ispr1 when d_in.decode.input_reg_a = SPR
-                       else fpr_to_gspr(insn_fra(d_in.insn)) when d_in.decode.input_reg_a = FRA and HAS_FPU
-                       else gpr_to_gspr(insn_ra(d_in.insn));
-    r_out.read2_reg <= d_in.ispr2 when d_in.decode.input_reg_b = SPR
-                       else fpr_to_gspr(insn_frb(d_in.insn)) when d_in.decode.input_reg_b = FRB and HAS_FPU
-                       else gpr_to_gspr(insn_rb(d_in.insn));
-    r_out.read3_reg <= gpr_to_gspr(insn_rcreg(d_in.insn)) when d_in.decode.input_reg_c = RCR
-                       else fpr_to_gspr(insn_frc(d_in.insn)) when d_in.decode.input_reg_c = FRC and HAS_FPU
-                       else fpr_to_gspr(insn_frt(d_in.insn)) when d_in.decode.input_reg_c = FRS and HAS_FPU
-                       else gpr_to_gspr(insn_rs(d_in.insn));
-
    c_out.read <= d_in.decode.input_cr;

    decode2_1: process(all)
@ -329,6 +370,7 @@ begin
        variable decoded_reg_c : decode_input_reg_t;
        variable decoded_reg_o : decode_output_reg_t;
        variable length : std_ulogic_vector(3 downto 0);
+        variable op : insn_type_t;
    begin
        v := r;

@ -339,16 +381,71 @@ begin

        --v.e.input_cr := d_in.decode.input_cr;
        v.e.output_cr := d_in.decode.output_cr;
-        
+
+        -- Work out whether XER common bits are set
+        v.e.output_xer := d_in.decode.output_carry;
+        case d_in.decode.insn_type is
+            when OP_ADD | OP_MUL_L64 | OP_DIV | OP_DIVE =>
+                -- OE field is valid in OP_ADD/OP_MUL_L64 with major opcode 31 only
+                if d_in.insn(31 downto 26) = "011111" and insn_oe(d_in.insn) = '1' then
+                    v.e.oe := '1';
+                    v.e.output_xer := '1';
+                end if;
+            when OP_MTSPR =>
+                if decode_spr_num(d_in.insn) = SPR_XER then
+                    v.e.output_xer := '1';
+                end if;
+            when others =>
+        end case;
+
        decoded_reg_a := decode_input_reg_a (d_in.decode.input_reg_a, d_in.insn, r_in.read1_data, d_in.ispr1,
                                             d_in.nia);
        decoded_reg_b := decode_input_reg_b (d_in.decode.input_reg_b, d_in.insn, r_in.read2_data, d_in.ispr2);
        decoded_reg_c := decode_input_reg_c (d_in.decode.input_reg_c, d_in.insn, r_in.read3_data);
-        decoded_reg_o := decode_output_reg (d_in.decode.output_reg_a, d_in.insn, d_in.ispr1);
+        decoded_reg_o := decode_output_reg (d_in.decode.output_reg_a, d_in.insn, d_in.ispro);
+
+        if d_in.decode.lr = '1' then
+            v.e.lr := insn_lk(d_in.insn);
+            -- b and bc have even major opcodes; bcreg is considered absolute
+            v.e.br_abs := insn_aa(d_in.insn) or d_in.insn(26);
+        end if;
+        op := d_in.decode.insn_type;
+
+        if d_in.decode.repeat /= NONE then
+            v.e.repeat := '1';
+            v.e.second := r.repeat;
+            case d_in.decode.repeat is
+                when DRSE =>
+                    -- do RS|1,RS for LE; RS,RS|1 for BE
+                    if r.repeat = d_in.big_endian then
+                        decoded_reg_c.reg(0) := '1';
+                    end if;
+                when DRTE =>
+                    -- do RT|1,RT for LE; RT,RT|1 for BE
+                    if r.repeat = d_in.big_endian then
+                        decoded_reg_o.reg(0) := '1';
+                    end if;
+                when DUPD =>
+                    -- update-form loads, 2nd instruction writes RA
+                    if r.repeat = '1' then
+                        decoded_reg_o.reg := decoded_reg_a.reg;
+                    end if;
+                when others =>
+            end case;
+        elsif v.e.lr = '1' and decoded_reg_a.reg_valid = '1' then
+            -- bcl/bclrl/bctarl instructions that need to write both CTR and LR have to be doubled
+            v.e.repeat := '1';
+            v.e.second := r.repeat;
+            -- first one does CTR, second does LR
+            decoded_reg_o.reg(0) := not r.repeat;
+        end if;

        r_out.read1_enable <= decoded_reg_a.reg_valid and d_in.valid;
+        r_out.read1_reg    <= decoded_reg_a.reg;
        r_out.read2_enable <= decoded_reg_b.reg_valid and d_in.valid;
+        r_out.read2_reg    <= decoded_reg_b.reg;
        r_out.read3_enable <= decoded_reg_c.reg_valid and d_in.valid;
+        r_out.read3_reg    <= decoded_reg_c.reg;

        case d_in.decode.length is
            when is1B =>
@ -366,32 +463,22 @@ begin
        -- execute unit
        v.e.nia := d_in.nia;
        v.e.unit := d_in.decode.unit;
-        v.e.insn_type := d_in.decode.insn_type;
+        v.e.fac := d_in.decode.facility;
+        v.e.instr_tag := instr_tag;
        v.e.read_reg1 := decoded_reg_a.reg;
-        v.e.read_data1 := decoded_reg_a.data;
-        v.e.bypass_data1 := gpr_a_bypass;
        v.e.read_reg2 := decoded_reg_b.reg;
-        v.e.read_data2 := decoded_reg_b.data;
-        v.e.bypass_data2 := gpr_b_bypass;
-        v.e.read_data3 := decoded_reg_c.data;
-        v.e.bypass_data3 := gpr_c_bypass;
        v.e.write_reg := decoded_reg_o.reg;
+        v.e.write_reg_enable := decoded_reg_o.reg_valid;
        v.e.rc := decode_rc(d_in.decode.rc, d_in.insn);
-        if not (d_in.decode.insn_type = OP_MUL_H32 or d_in.decode.insn_type = OP_MUL_H64) then
-            v.e.oe := decode_oe(d_in.decode.rc, d_in.insn);
-        end if;
-        v.e.cr := c_in.read_cr_data;
-        v.e.bypass_cr := cr_bypass;
        v.e.xerc := c_in.read_xerc_data;
        v.e.invert_a := d_in.decode.invert_a;
+        v.e.addm1 := '0';
+        v.e.insn_type := op;
        v.e.invert_out := d_in.decode.invert_out;
        v.e.input_carry := d_in.decode.input_carry;
        v.e.output_carry := d_in.decode.output_carry;
        v.e.is_32bit := d_in.decode.is_32bit;
        v.e.is_signed := d_in.decode.is_signed;
-        if d_in.decode.lr = '1' then
-            v.e.lr := insn_lk(d_in.insn);
-        end if;
        v.e.insn := d_in.insn;
        v.e.data_len := length;
        v.e.byte_reverse := d_in.decode.byte_reverse;
@ -399,24 +486,48 @@ begin
        v.e.update := d_in.decode.update;
        v.e.reserve := d_in.decode.reserve;
        v.e.br_pred := d_in.br_pred;
+        v.e.result_sel := result_select(op);
+        v.e.sub_select := subresult_select(op);
+        if op = OP_BC or op = OP_BCREG then
+            if d_in.insn(23) = '0' and r.repeat = '0' and
+                not (d_in.decode.insn_type = OP_BCREG and d_in.insn(10) = '0') then
+                -- decrement CTR if BO(2) = 0 and not bcctr
+                v.e.addm1 := '1';
+                v.e.result_sel := "000";        -- select adder output
+            end if;
+        end if;
+
+        -- See if any of the operands can get their value via the bypass path.
+        case gpr_a_bypass is
+            when '1' =>
+                v.e.read_data1 := execute_bypass.data;
+            when others =>
+                v.e.read_data1 := decoded_reg_a.data;
+        end case;
+        case gpr_b_bypass is
+            when '1' =>
+                v.e.read_data2 := execute_bypass.data;
+            when others =>
+                v.e.read_data2 := decoded_reg_b.data;
+        end case;
+        case gpr_c_bypass is
+            when '1' =>
+                v.e.read_data3 := execute_bypass.data;
+            when others =>
+                v.e.read_data3 := decoded_reg_c.data;
+        end case;
+
+        v.e.cr := c_in.read_cr_data;
+        if cr_bypass = '1' then
+            v.e.cr := execute_cr_bypass.data;
+        end if;

        -- issue control
        control_valid_in <= d_in.valid;
        control_sgl_pipe <= d_in.decode.sgl_pipe;

-        gpr_write_valid <= decoded_reg_o.reg_valid;
+        gpr_write_valid <= v.e.write_reg_enable;
        gpr_write <= decoded_reg_o.reg;
-        gpr_bypassable <= '0';
-        if EX1_BYPASS and d_in.decode.unit = ALU then
-            gpr_bypassable <= '1';
-        end if;
-        update_gpr_write_valid <= d_in.decode.update;
-        update_gpr_write_reg <= decoded_reg_a.reg;
-        if v.e.lr = '1' then
-            -- there are no instructions that have both update=1 and lr=1
-            update_gpr_write_valid <= '1';
-            update_gpr_write_reg <= fast_spr_num(SPR_LR);
-        end if;

        gpr_a_read_valid <= decoded_reg_a.reg_valid;
        gpr_a_read <= decoded_reg_a.reg;
@ -428,15 +539,20 @@ begin
        gpr_c_read <= decoded_reg_c.reg;

        cr_write_valid <= d_in.decode.output_cr or decode_rc(d_in.decode.rc, d_in.insn);
-        cr_bypass_avail <= '0';
-        if EX1_BYPASS and d_in.decode.unit = ALU then
-            cr_bypass_avail <= d_in.decode.output_cr;
-        end if;
+        -- Since ops that write CR only write some of the fields,
+        -- any op that writes CR effectively also reads it.
+        cr_read_valid <= cr_write_valid or d_in.decode.input_cr;

        v.e.valid := control_valid_out;
+        if control_valid_out = '1' then
+            v.repeat := v.e.repeat and not r.repeat;
+        end if;
+
+        stall_out <= control_stall_out or v.repeat;

        if rst = '1' or flush_in = '1' then
            v.e := Decode2ToExecute1Init;
+            v.repeat := '0';
        end if;

        -- Update registers
@ -456,9 +572,9 @@ begin
                            r.e.valid &
                            stopped_out &
                            stall_out &
-                            r.e.bypass_data3 &
-                            r.e.bypass_data2 &
-                            r.e.bypass_data1;
+                            gpr_a_bypass &
+                            gpr_b_bypass &
+                            gpr_c_bypass;
            end if;
        end process;
        log_out <= log_data;
--- a/decode_types.vhdl
+++ b/decode_types.vhdl
@ -11,7 +11,6 @@ package decode_types is
                         OP_FPOP, OP_FPOP_I,
                         OP_ICBI, OP_ICBT, OP_ISEL, OP_ISYNC,
 			 OP_LOAD, OP_STORE,
-                         OP_FPLOAD, OP_FPSTORE,
 			 OP_MCRXRX, OP_MFCR, OP_MFMSR, OP_MFSPR, OP_MOD,
 			 OP_MTCRF, OP_MTMSRD, OP_MTSPR, OP_MUL_L64,
 			 OP_MUL_H64, OP_MUL_H32, OP_OR,
@ -25,7 +24,7 @@ package decode_types is
 			 );
    type input_reg_a_t is (NONE, RA, RA_OR_ZERO, SPR, CIA, FRA);
    type input_reg_b_t is (NONE, RB, CONST_UI, CONST_SI, CONST_SI_HI, CONST_UI_HI, CONST_LI, CONST_BD,
-                           CONST_DXHI4, CONST_DS, CONST_M1, CONST_SH, CONST_SH32, SPR, FRB);
+                           CONST_DXHI4, CONST_DS, CONST_DQ, CONST_M1, CONST_SH, CONST_SH32, SPR, FRB);
    type input_reg_c_t is (NONE, RS, RCR, FRC, FRS);
    type output_reg_a_t is (NONE, RT, RA, SPR, FRT);
    type rc_t is (NONE, ONE, RC);
@ -50,10 +49,17 @@ package decode_types is
    constant TOO_OFFSET : integer := 0;

    type unit_t is (NONE, ALU, LDST, FPU);
+    type facility_t is (NONE, FPU);
    type length_t is (NONE, is1B, is2B, is4B, is8B);

+    type repeat_t is (NONE,      -- instruction is not repeated
+                      DRSE,      -- double RS, endian twist
+                      DRTE,      -- double RT, endian twist
+                      DUPD);     -- update-form load
+
    type decode_rom_t is record
 	unit         : unit_t;
+        facility     : facility_t;
 	insn_type    : insn_type_t;
 	input_reg_a  : input_reg_a_t;
 	input_reg_b  : input_reg_b_t;
@ -83,15 +89,16 @@ package decode_types is
 	lr           : std_ulogic;

 	sgl_pipe     : std_ulogic;
+        repeat       : repeat_t;
    end record;
-    constant decode_rom_init : decode_rom_t := (unit => NONE,
+    constant decode_rom_init : decode_rom_t := (unit => NONE, facility => NONE,
 						insn_type => OP_ILLEGAL, input_reg_a => NONE,
 						input_reg_b => NONE, input_reg_c => NONE,
 						output_reg_a => NONE, input_cr => '0', output_cr => '0',
 						invert_a => '0', invert_out => '0', input_carry => ZERO, output_carry => '0',
 						length => NONE, byte_reverse => '0', sign_extend => '0',
 						update => '0', reserve => '0', is_32bit => '0',
-						is_signed => '0', rc => NONE, lr => '0', sgl_pipe => '0');
+						is_signed => '0', rc => NONE, lr => '0', sgl_pipe => '0', repeat => NONE);

 end decode_types;

--- a/divider.vhdl
+++ b/divider.vhdl
@ -42,6 +42,8 @@ begin
                quot <= (others => '0');
                running <= '0';
                count <= "0000000";
+                is_32bit <= '0';
+                overflow <= '0';
            elsif d_in.valid = '1' then
                if d_in.is_extended = '1'  then
                    dend <= '0' & d_in.dividend & x"0000000000000000";
@ -123,9 +125,9 @@ begin
    divider_out: process(clk)
    begin
        if rising_edge(clk) then
-	    d_out.valid <= '0';
+            d_out.valid <= '0';
            d_out.write_reg_data <= oresult;
-	    d_out.overflow <= did_ovf;
+            d_out.overflow <= did_ovf;
            if count = "1000000" then
                d_out.valid <= '1';
            end if;
--- a/divider_tb.vhdl
+++ b/divider_tb.vhdl
@ -1,3 +1,6 @@
+library vunit_lib;
+context vunit_lib.vunit_context;
+
 library ieee;
 use ieee.std_logic_1164.all;
 use ieee.numeric_std.all;
@ -5,10 +8,13 @@ use ieee.numeric_std.all;
 library work;
 use work.decode_types.all;
 use work.common.all;
-use work.glibc_random.all;
 use work.ppc_fx_insns.all;

+library osvvm;
+use osvvm.RandomPkg.all;
+
 entity divider_tb is
+    generic (runner_cfg : string := runner_cfg_default);
 end divider_tb;

 architecture behave of divider_tb is
@ -37,516 +43,481 @@ begin
        variable q128: std_ulogic_vector(127 downto 0);
        variable q64: std_ulogic_vector(63 downto 0);
        variable rem32: std_ulogic_vector(31 downto 0);
+        variable rnd : RandomPType;
    begin
-        rst <= '1';
-        wait for clk_period;
-        rst <= '0';
-
-        d1.valid <= '1';
-        d1.dividend <= x"0000000010001000";
-        d1.divisor  <= x"0000000000001111";
-        d1.is_signed <= '0';
-        d1.is_32bit <= '0';
-        d1.is_extended <= '0';
-        d1.is_modulus <= '0';
-        d1.neg_result <= '0';
+        rnd.InitSeed(stim_process'path_name);

-        wait for clk_period;
-        assert d2.valid = '0';
+        test_runner_setup(runner, runner_cfg);

-        d1.valid <= '0';
-
-        for j in 0 to 66 loop
+        while test_suite loop
+            rst <= '1';
            wait for clk_period;
-            if d2.valid = '1' then
-                exit;
-            end if;
-        end loop;
+            rst <= '0';

-        assert d2.valid = '1';
-        assert d2.write_reg_data = x"000000000000f001" report "result " & to_hstring(d2.write_reg_data);
+            d1.is_signed <= '0';
+            d1.neg_result <= '0';
+            d1.is_extended <= '0';
+            d1.is_32bit <= '0';
+            d1.is_modulus <= '0';
+            d1.valid <= '0';

-        wait for clk_period;
-        assert d2.valid = '0' report "valid";
+            if run("Test interface") then
+                d1.valid <= '1';
+                d1.dividend <= x"0000000010001000";
+                d1.divisor  <= x"0000000000001111";

-        d1.valid <= '1';
+                wait for clk_period;
+                check_false(?? d2.valid, result("for valid"));

-        wait for clk_period;
-        assert d2.valid = '0' report "valid";
+                d1.valid <= '0';

-        d1.valid <= '0';
+                for j in 0 to 66 loop
+                    wait for clk_period;
+                    if d2.valid = '1' then
+                        exit;
+                    end if;
+                end loop;

-        for j in 0 to 66 loop
-            wait for clk_period;
-            if d2.valid = '1' then
-                exit;
-            end if;
-        end loop;
+                check_true(?? d2.valid, result("for valid"));
+                check_equal(d2.write_reg_data, 16#f001#);

-        assert d2.valid = '1';
-        assert d2.write_reg_data = x"000000000000f001" report "result " & to_hstring(d2.write_reg_data);
+                wait for clk_period;
+                check_false(?? d2.valid, result("for valid"));

-        wait for clk_period;
-        assert d2.valid = '0';
+                d1.valid <= '1';

-        -- test divd
-        report "test divd";
-        divd_loop : for dlength in 1 to 8 loop
-            for vlength in 1 to dlength loop
-                for i in 0 to 100 loop
-                    ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64));
-                    rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64));
+                wait for clk_period;
+                check_false(?? d2.valid, result("for valid"));

-                    d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra));
-                    d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb));
-                    d1.is_signed <= '1';
-                    d1.neg_result <= ra(63) xor rb(63);
-                    d1.valid <= '1';
+                d1.valid <= '0';

+                for j in 0 to 66 loop
                    wait for clk_period;
-
-                    d1.valid <= '0';
-                    for j in 0 to 66 loop
-                        wait for clk_period;
-                        if d2.valid = '1' then
-                            exit;
-                        end if;
-                    end loop;
-                    assert d2.valid = '1';
-
-                    behave_rt := (others => '0');
-                    if rb /= x"0000000000000000" and (ra /= x"8000000000000000" or rb /= x"ffffffffffffffff") then
-                        behave_rt := ppc_divd(ra, rb);
+                    if d2.valid = '1' then
+                        exit;
                    end if;
-                    assert to_hstring(behave_rt) = to_hstring(d2.write_reg_data)
-                        report "bad divd expected " & to_hstring(behave_rt) & " got " & to_hstring(d2.write_reg_data);
                end loop;
-            end loop;
-        end loop;

-        -- test divdu
-        report "test divdu";
-        divdu_loop : for dlength in 1 to 8 loop
-            for vlength in 1 to dlength loop
-                for i in 0 to 100 loop
-                    ra := std_ulogic_vector(resize(unsigned(pseudorand(dlength * 8)), 64));
-                    rb := std_ulogic_vector(resize(unsigned(pseudorand(vlength * 8)), 64));
-
-                    d1.dividend <= ra;
-                    d1.divisor <= rb;
-                    d1.is_signed <= '0';
-                    d1.neg_result <= '0';
-                    d1.valid <= '1';
-
-                    wait for clk_period;
-
-                    d1.valid <= '0';
-                    for j in 0 to 66 loop
-                        wait for clk_period;
-                        if d2.valid = '1' then
-                            exit;
-                        end if;
+                check_true(?? d2.valid, result("for valid"));
+                check_equal(d2.write_reg_data, 16#f001#);
+
+                wait for clk_period;
+                check_false(?? d2.valid, result("for valid"));
+
+            elsif run("Test divd") then
+                divd_loop : for dlength in 1 to 8 loop
+                    for vlength in 1 to dlength loop
+                        for i in 0 to 100 loop
+                            ra := std_ulogic_vector(resize(signed(rnd.RandSlv(dlength * 8)), 64));
+                            rb := std_ulogic_vector(resize(signed(rnd.RandSlv(vlength * 8)), 64));
+
+                            d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra));
+                            d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb));
+                            d1.is_signed <= '1';
+                            d1.neg_result <= ra(63) xor rb(63);
+                            d1.valid <= '1';
+
+                            wait for clk_period;
+
+                            d1.valid <= '0';
+                            for j in 0 to 66 loop
+                                wait for clk_period;
+                                if d2.valid = '1' then
+                                    exit;
+                                end if;
+                            end loop;
+                            check_true(?? d2.valid, result("for valid"));
+
+                            behave_rt := (others => '0');
+                            if rb /= x"0000000000000000" and (ra /= x"8000000000000000" or rb /= x"ffffffffffffffff") then
+                                behave_rt := ppc_divd(ra, rb);
+                            end if;
+                            check_equal(d2.write_reg_data, behave_rt, result("for divd"));
+                        end loop;
                    end loop;
-                    assert d2.valid = '1';
-
-                    behave_rt := (others => '0');
-                    if rb /= x"0000000000000000" then
-                        behave_rt := ppc_divdu(ra, rb);
-                    end if;
-                    assert to_hstring(behave_rt) = to_hstring(d2.write_reg_data)
-                        report "bad divdu expected " & to_hstring(behave_rt) & " got " & to_hstring(d2.write_reg_data);
                end loop;
-            end loop;
-        end loop;
-
-        -- test divde
-        report "test divde";
-        divde_loop : for vlength in 1 to 8 loop
-            for dlength in 1 to vlength loop
-                for i in 0 to 100 loop
-                    ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64));
-                    rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64));
-
-                    d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra));
-                    d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb));
-                    d1.is_signed <= '1';
-                    d1.neg_result <= ra(63) xor rb(63);
-                    d1.is_extended <= '1';
-                    d1.valid <= '1';
-
-                    wait for clk_period;

-                    d1.valid <= '0';
-                    for j in 0 to 66 loop
-                        wait for clk_period;
-                        if d2.valid = '1' then
-                            exit;
-                        end if;
+            elsif run("Test divdu") then
+                divdu_loop : for dlength in 1 to 8 loop
+                    for vlength in 1 to dlength loop
+                        for i in 0 to 100 loop
+                            ra := std_ulogic_vector(resize(unsigned(rnd.RandSlv(dlength * 8)), 64));
+                            rb := std_ulogic_vector(resize(unsigned(rnd.RandSlv(vlength * 8)), 64));
+
+                            d1.dividend <= ra;
+                            d1.divisor <= rb;
+                            d1.valid <= '1';
+
+                            wait for clk_period;
+
+                            d1.valid <= '0';
+                            for j in 0 to 66 loop
+                                wait for clk_period;
+                                if d2.valid = '1' then
+                                    exit;
+                                end if;
+                            end loop;
+                            check_true(?? d2.valid, result("for valid"));
+
+                            behave_rt := (others => '0');
+                            if rb /= x"0000000000000000" then
+                                behave_rt := ppc_divdu(ra, rb);
+                            end if;
+                            check_equal(d2.write_reg_data, behave_rt, result("for divdu"));
+                        end loop;
                    end loop;
-                    assert d2.valid = '1';
-
-                    behave_rt := (others => '0');
-                    if rb /= x"0000000000000000" then
-                        d128 := ra & x"0000000000000000";
-                        q128 := std_ulogic_vector(signed(d128) / signed(rb));
-                        if q128(127 downto 63) = x"0000000000000000" & '0' or
-                            q128(127 downto 63) = x"ffffffffffffffff" & '1' then
-                            behave_rt := q128(63 downto 0);
-                        end if;
-                    end if;
-                    assert to_hstring(behave_rt) = to_hstring(d2.write_reg_data)
-                        report "bad divde expected " & to_hstring(behave_rt) & " got " & to_hstring(d2.write_reg_data) & " for ra = " & to_hstring(ra) & " rb = " & to_hstring(rb);
                end loop;
-            end loop;
-        end loop;
-
-        -- test divdeu
-        report "test divdeu";
-        divdeu_loop : for vlength in 1 to 8 loop
-            for dlength in 1 to vlength loop
-                for i in 0 to 100 loop
-                    ra := std_ulogic_vector(resize(unsigned(pseudorand(dlength * 8)), 64));
-                    rb := std_ulogic_vector(resize(unsigned(pseudorand(vlength * 8)), 64));
-
-                    d1.dividend <= ra;
-                    d1.divisor <= rb;
-                    d1.is_signed <= '0';
-                    d1.neg_result <= '0';
-                    d1.is_extended <= '1';
-                    d1.valid <= '1';
-
-                    wait for clk_period;

-                    d1.valid <= '0';
-                    for j in 0 to 66 loop
-                        wait for clk_period;
-                        if d2.valid = '1' then
-                            exit;
-                        end if;
+            elsif run("Test divde") then
+                divde_loop : for vlength in 1 to 8 loop
+                    for dlength in 1 to vlength loop
+                        for i in 0 to 100 loop
+                            ra := std_ulogic_vector(resize(signed(rnd.RandSlv(dlength * 8)), 64));
+                            rb := std_ulogic_vector(resize(signed(rnd.RandSlv(vlength * 8)), 64));
+
+                            d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra));
+                            d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb));
+                            d1.is_signed <= '1';
+                            d1.neg_result <= ra(63) xor rb(63);
+                            d1.is_extended <= '1';
+                            d1.valid <= '1';
+
+                            wait for clk_period;
+
+                            d1.valid <= '0';
+                            for j in 0 to 66 loop
+                                wait for clk_period;
+                                if d2.valid = '1' then
+                                    exit;
+                                end if;
+                            end loop;
+                            check_true(?? d2.valid, result("for valid"));
+
+                            behave_rt := (others => '0');
+                            if rb /= x"0000000000000000" then
+                                d128 := ra & x"0000000000000000";
+                                q128 := std_ulogic_vector(signed(d128) / signed(rb));
+                                if q128(127 downto 63) = x"0000000000000000" & '0' or
+                                    q128(127 downto 63) = x"ffffffffffffffff" & '1' then
+                                    behave_rt := q128(63 downto 0);
+                                end if;
+                            end if;
+                            check_equal(d2.write_reg_data, behave_rt, result("for divde"));
+                        end loop;
                    end loop;
-                    assert d2.valid = '1';
-
-                    behave_rt := (others => '0');
-                    if unsigned(rb) > unsigned(ra) then
-                        d128 := ra & x"0000000000000000";
-                        q128 := std_ulogic_vector(unsigned(d128) / unsigned(rb));
-                        behave_rt := q128(63 downto 0);
-                    end if;
-                    assert to_hstring(behave_rt) = to_hstring(d2.write_reg_data)
-                        report "bad divdeu expected " & to_hstring(behave_rt) & " got " & to_hstring(d2.write_reg_data) & " for ra = " & to_hstring(ra) & " rb = " & to_hstring(rb);
                end loop;
-            end loop;
-        end loop;
-
-        -- test divw
-        report "test divw";
-        divw_loop : for dlength in 1 to 4 loop
-            for vlength in 1 to dlength loop
-                for i in 0 to 100 loop
-                    ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64));
-                    rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64));
-
-                    d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra));
-                    d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb));
-                    d1.is_signed <= '1';
-                    d1.neg_result <= ra(63) xor rb(63);
-                    d1.is_extended <= '0';
-                    d1.is_32bit <= '1';
-                    d1.valid <= '1';
-
-                    wait for clk_period;

-                    d1.valid <= '0';
-                    for j in 0 to 66 loop
-                        wait for clk_period;
-                        if d2.valid = '1' then
-                            exit;
-                        end if;
+            elsif run("Test divdeu") then
+                divdeu_loop : for vlength in 1 to 8 loop
+                    for dlength in 1 to vlength loop
+                        for i in 0 to 100 loop
+                            ra := std_ulogic_vector(resize(unsigned(rnd.RandSlv(dlength * 8)), 64));
+                            rb := std_ulogic_vector(resize(unsigned(rnd.RandSlv(vlength * 8)), 64));
+
+                            d1.dividend <= ra;
+                            d1.divisor <= rb;
+                            d1.is_extended <= '1';
+                            d1.valid <= '1';
+
+                            wait for clk_period;
+
+                            d1.valid <= '0';
+                            for j in 0 to 66 loop
+                                wait for clk_period;
+                                if d2.valid = '1' then
+                                    exit;
+                                end if;
+                            end loop;
+                            check_true(?? d2.valid, result("for valid"));
+
+                            behave_rt := (others => '0');
+                            if unsigned(rb) > unsigned(ra) then
+                                d128 := ra & x"0000000000000000";
+                                q128 := std_ulogic_vector(unsigned(d128) / unsigned(rb));
+                                behave_rt := q128(63 downto 0);
+                            end if;
+                            check_equal(d2.write_reg_data, behave_rt, result("for divdeu"));
+                        end loop;
                    end loop;
-                    assert d2.valid = '1';
-
-                    behave_rt := (others => '0');
-                    if rb /= x"0000000000000000" and (ra /= x"ffffffff80000000" or rb /= x"ffffffffffffffff") then
-                        behave_rt := ppc_divw(ra, rb);
-                    end if;
-                    assert behave_rt = d2.write_reg_data
-                        report "bad divw expected " & to_hstring(behave_rt) & " got " & to_hstring(d2.write_reg_data);
                end loop;
-            end loop;
-        end loop;
-
-        -- test divwu
-        report "test divwu";
-        divwu_loop : for dlength in 1 to 4 loop
-            for vlength in 1 to dlength loop
-                for i in 0 to 100 loop
-                    ra := std_ulogic_vector(resize(unsigned(pseudorand(dlength * 8)), 64));
-                    rb := std_ulogic_vector(resize(unsigned(pseudorand(vlength * 8)), 64));
-
-                    d1.dividend <= ra;
-                    d1.divisor <= rb;
-                    d1.is_signed <= '0';
-                    d1.neg_result <= '0';
-                    d1.is_extended <= '0';
-                    d1.is_32bit <= '1';
-                    d1.valid <= '1';

-                    wait for clk_period;
-
-                    d1.valid <= '0';
-                    for j in 0 to 66 loop
-                        wait for clk_period;
-                        if d2.valid = '1' then
-                            exit;
-                        end if;
+            elsif run("Test divw") then
+                divw_loop : for dlength in 1 to 4 loop
+                    for vlength in 1 to dlength loop
+                        for i in 0 to 100 loop
+                            ra := std_ulogic_vector(resize(signed(rnd.RandSlv(dlength * 8)), 64));
+                            rb := std_ulogic_vector(resize(signed(rnd.RandSlv(vlength * 8)), 64));
+
+                            d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra));
+                            d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb));
+                            d1.is_signed <= '1';
+                            d1.neg_result <= ra(63) xor rb(63);
+                            d1.is_32bit <= '1';
+                            d1.valid <= '1';
+
+                            wait for clk_period;
+
+                            d1.valid <= '0';
+                            for j in 0 to 66 loop
+                                wait for clk_period;
+                                if d2.valid = '1' then
+                                    exit;
+                                end if;
+                            end loop;
+                            check_true(?? d2.valid, result("for valid"));
+
+                            behave_rt := (others => '0');
+                            if rb /= x"0000000000000000" and (ra /= x"ffffffff80000000" or rb /= x"ffffffffffffffff") then
+                                behave_rt := ppc_divw(ra, rb);
+                            end if;
+                            check_equal(d2.write_reg_data, behave_rt, result("for divw"));
+                        end loop;
                    end loop;
-                    assert d2.valid = '1';
-
-                    behave_rt := (others => '0');
-                    if rb /= x"0000000000000000" then
-                        behave_rt := ppc_divwu(ra, rb);
-                    end if;
-                    assert behave_rt = d2.write_reg_data
-                        report "bad divwu expected " & to_hstring(behave_rt) & " got " & to_hstring(d2.write_reg_data);
                end loop;
-            end loop;
-        end loop;

-        -- test divwe
-        report "test divwe";
-        divwe_loop : for vlength in 1 to 4 loop
-            for dlength in 1 to vlength loop
-                for i in 0 to 100 loop
-                    ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 32)) & x"00000000";
-                    rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64));
-
-                    d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra));
-                    d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb));
-                    d1.is_signed <= '1';
-                    d1.neg_result <= ra(63) xor rb(63);
-                    d1.is_extended <= '0';
-                    d1.is_32bit <= '1';
-                    d1.valid <= '1';
-
-                    wait for clk_period;
-
-                    d1.valid <= '0';
-                    for j in 0 to 66 loop
-                        wait for clk_period;
-                        if d2.valid = '1' then
-                            exit;
-                        end if;
+            elsif run("Test divwu") then
+                divwu_loop : for dlength in 1 to 4 loop
+                    for vlength in 1 to dlength loop
+                        for i in 0 to 100 loop
+                            ra := std_ulogic_vector(resize(unsigned(rnd.RandSlv(dlength * 8)), 64));
+                            rb := std_ulogic_vector(resize(unsigned(rnd.RandSlv(vlength * 8)), 64));
+
+                            d1.dividend <= ra;
+                            d1.divisor <= rb;
+                            d1.is_32bit <= '1';
+                            d1.valid <= '1';
+
+                            wait for clk_period;
+
+                            d1.valid <= '0';
+                            for j in 0 to 66 loop
+                                wait for clk_period;
+                                if d2.valid = '1' then
+                                    exit;
+                                end if;
+                            end loop;
+                            check_true(?? d2.valid, result("for valid"));
+
+                            behave_rt := (others => '0');
+                            if rb /= x"0000000000000000" then
+                                behave_rt := ppc_divwu(ra, rb);
+                            end if;
+                            check_equal(d2.write_reg_data, behave_rt, result("for divwu"));
+                        end loop;
                    end loop;
-                    assert d2.valid = '1';
-
-                    behave_rt := (others => '0');
-                    if rb /= x"0000000000000000" then
-                        q64 := std_ulogic_vector(signed(ra) / signed(rb));
-                        if q64(63 downto 31) = x"00000000" & '0' or
-                            q64(63 downto 31) = x"ffffffff" & '1' then
-                            behave_rt := x"00000000" & q64(31 downto 0);
-                        end if;
-                        assert behave_rt = d2.write_reg_data
-                            report "bad divwe expected " & to_hstring(behave_rt) & " got " & to_hstring(d2.write_reg_data) & " for ra = " & to_hstring(ra) & " rb = " & to_hstring(rb);
-                    end if;
                end loop;
-            end loop;
-        end loop;

-        -- test divweu
-        report "test divweu";
-        divweu_loop : for vlength in 1 to 4 loop
-            for dlength in 1 to vlength loop
-                for i in 0 to 100 loop
-                    ra := std_ulogic_vector(resize(unsigned(pseudorand(dlength * 8)), 32)) & x"00000000";
-                    rb := std_ulogic_vector(resize(unsigned(pseudorand(vlength * 8)), 64));
-
-                    d1.dividend <= ra;
-                    d1.divisor <= rb;
-                    d1.is_signed <= '0';
-                    d1.neg_result <= '0';
-                    d1.is_extended <= '0';
-                    d1.is_32bit <= '1';
-                    d1.valid <= '1';
-
-                    wait for clk_period;
-
-                    d1.valid <= '0';
-                    for j in 0 to 66 loop
-                        wait for clk_period;
-                        if d2.valid = '1' then
-                            exit;
-                        end if;
+            elsif run("Test divwe") then
+                divwe_loop : for vlength in 1 to 4 loop
+                    for dlength in 1 to vlength loop
+                        for i in 0 to 100 loop
+                            ra := std_ulogic_vector(resize(signed(rnd.RandSlv(dlength * 8)), 32)) & x"00000000";
+                            rb := std_ulogic_vector(resize(signed(rnd.RandSlv(vlength * 8)), 64));
+
+                            d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra));
+                            d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb));
+                            d1.is_signed <= '1';
+                            d1.neg_result <= ra(63) xor rb(63);
+                            d1.is_32bit <= '1';
+                            d1.valid <= '1';
+
+                            wait for clk_period;
+
+                            d1.valid <= '0';
+                            for j in 0 to 66 loop
+                                wait for clk_period;
+                                if d2.valid = '1' then
+                                    exit;
+                                end if;
+                            end loop;
+                            check_true(?? d2.valid, result("for valid"));
+
+                            behave_rt := (others => '0');
+                            if rb /= x"0000000000000000" then
+                                q64 := std_ulogic_vector(signed(ra) / signed(rb));
+                                if q64(63 downto 31) = x"00000000" & '0' or
+                                    q64(63 downto 31) = x"ffffffff" & '1' then
+                                    behave_rt := x"00000000" & q64(31 downto 0);
+                                end if;
+                                check_equal(d2.write_reg_data, behave_rt, result("for divwe"));
+                            end if;
+                        end loop;
                    end loop;
-                    assert d2.valid = '1';
-
-                    behave_rt := (others => '0');
-                    if unsigned(rb(31 downto 0)) > unsigned(ra(63 downto 32)) then
-                        behave_rt := std_ulogic_vector(unsigned(ra) / unsigned(rb));
-                    end if;
-                    assert behave_rt = d2.write_reg_data
-                        report "bad divweu expected " & to_hstring(behave_rt) & " got " & to_hstring(d2.write_reg_data) & " for ra = " & to_hstring(ra) & " rb = " & to_hstring(rb);
                end loop;
-            end loop;
-        end loop;
-
-        -- test modsd
-        report "test modsd";
-        modsd_loop : for dlength in 1 to 8 loop
-            for vlength in 1 to dlength loop
-                for i in 0 to 100 loop
-                    ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64));
-                    rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64));
-
-                    d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra));
-                    d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb));
-                    d1.is_signed <= '1';
-                    d1.neg_result <= ra(63);
-                    d1.is_extended <= '0';
-                    d1.is_32bit <= '0';
-                    d1.is_modulus <= '1';
-                    d1.valid <= '1';

-                    wait for clk_period;
-
-                    d1.valid <= '0';
-                    for j in 0 to 66 loop
-                        wait for clk_period;
-                        if d2.valid = '1' then
-                            exit;
-                        end if;
+            elsif run("Test divweu") then
+                divweu_loop : for vlength in 1 to 4 loop
+                    for dlength in 1 to vlength loop
+                        for i in 0 to 100 loop
+                            ra := std_ulogic_vector(resize(unsigned(rnd.RandSlv(dlength * 8)), 32)) & x"00000000";
+                            rb := std_ulogic_vector(resize(unsigned(rnd.RandSlv(vlength * 8)), 64));
+
+                            d1.dividend <= ra;
+                            d1.divisor <= rb;
+                            d1.is_32bit <= '1';
+                            d1.valid <= '1';
+
+                            wait for clk_period;
+
+                            d1.valid <= '0';
+                            for j in 0 to 66 loop
+                                wait for clk_period;
+                                if d2.valid = '1' then
+                                    exit;
+                                end if;
+                            end loop;
+                            check_true(?? d2.valid, result("for valid"));
+
+                            behave_rt := (others => '0');
+                            if unsigned(rb(31 downto 0)) > unsigned(ra(63 downto 32)) then
+                                behave_rt := std_ulogic_vector(unsigned(ra) / unsigned(rb));
+                            end if;
+                            check_equal(d2.write_reg_data, behave_rt, result("for divweu"));
+                        end loop;
                    end loop;
-                    assert d2.valid = '1';
-
-                    behave_rt := (others => '0');
-                    if rb /= x"0000000000000000" then
-                        behave_rt := std_ulogic_vector(signed(ra) rem signed(rb));
-                    end if;
-                    assert behave_rt = d2.write_reg_data
-                        report "bad modsd expected " & to_hstring(behave_rt) & " got " & to_hstring(d2.write_reg_data);
                end loop;
-            end loop;
-        end loop;
-
-        -- test modud
-        report "test modud";
-        modud_loop : for dlength in 1 to 8 loop
-            for vlength in 1 to dlength loop
-                for i in 0 to 100 loop
-                    ra := std_ulogic_vector(resize(unsigned(pseudorand(dlength * 8)), 64));
-                    rb := std_ulogic_vector(resize(unsigned(pseudorand(vlength * 8)), 64));
-
-                    d1.dividend <= ra;
-                    d1.divisor <= rb;
-                    d1.is_signed <= '0';
-                    d1.neg_result <= '0';
-                    d1.is_extended <= '0';
-                    d1.is_32bit <= '0';
-                    d1.is_modulus <= '1';
-                    d1.valid <= '1';
-
-                    wait for clk_period;

-                    d1.valid <= '0';
-                    for j in 0 to 66 loop
-                        wait for clk_period;
-                        if d2.valid = '1' then
-                            exit;
-                        end if;
+            elsif run("Test modsd") then
+                modsd_loop : for dlength in 1 to 8 loop
+                    for vlength in 1 to dlength loop
+                        for i in 0 to 100 loop
+                            ra := std_ulogic_vector(resize(signed(rnd.RandSlv(dlength * 8)), 64));
+                            rb := std_ulogic_vector(resize(signed(rnd.RandSlv(vlength * 8)), 64));
+
+                            d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra));
+                            d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb));
+                            d1.is_signed <= '1';
+                            d1.neg_result <= ra(63);
+                            d1.is_modulus <= '1';
+                            d1.valid <= '1';
+
+                            wait for clk_period;
+
+                            d1.valid <= '0';
+                            for j in 0 to 66 loop
+                                wait for clk_period;
+                                if d2.valid = '1' then
+                                    exit;
+                                end if;
+                            end loop;
+                            check_true(?? d2.valid, result("for valid"));
+
+                            behave_rt := (others => '0');
+                            if rb /= x"0000000000000000" then
+                                behave_rt := std_ulogic_vector(signed(ra) rem signed(rb));
+                            end if;
+                            check_equal(d2.write_reg_data, behave_rt, result("for modsd"));
+                        end loop;
                    end loop;
-                    assert d2.valid = '1';
-
-                    behave_rt := (others => '0');
-                    if rb /= x"0000000000000000" then
-                        behave_rt := std_ulogic_vector(unsigned(ra) rem unsigned(rb));
-                    end if;
-                    assert behave_rt = d2.write_reg_data
-                        report "bad modud expected " & to_hstring(behave_rt) & " got " & to_hstring(d2.write_reg_data);
                end loop;
-            end loop;
-        end loop;
-
-        -- test modsw
-        report "test modsw";
-        modsw_loop : for dlength in 1 to 4 loop
-            for vlength in 1 to dlength loop
-                for i in 0 to 100 loop
-                    ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64));
-                    rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64));
-
-                    d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra));
-                    d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb));
-                    d1.is_signed <= '1';
-                    d1.neg_result <= ra(63);
-                    d1.is_extended <= '0';
-                    d1.is_32bit <= '1';
-                    d1.is_modulus <= '1';
-                    d1.valid <= '1';
-
-                    wait for clk_period;

-                    d1.valid <= '0';
-                    for j in 0 to 66 loop
-                        wait for clk_period;
-                        if d2.valid = '1' then
-                            exit;
-                        end if;
+            elsif run("Test modud") then
+                modud_loop : for dlength in 1 to 8 loop
+                    for vlength in 1 to dlength loop
+                        for i in 0 to 100 loop
+                            ra := std_ulogic_vector(resize(unsigned(rnd.RandSlv(dlength * 8)), 64));
+                            rb := std_ulogic_vector(resize(unsigned(rnd.RandSlv(vlength * 8)), 64));
+
+                            d1.dividend <= ra;
+                            d1.divisor <= rb;
+                            d1.is_modulus <= '1';
+                            d1.valid <= '1';
+
+                            wait for clk_period;
+
+                            d1.valid <= '0';
+                            for j in 0 to 66 loop
+                                wait for clk_period;
+                                if d2.valid = '1' then
+                                    exit;
+                                end if;
+                            end loop;
+                            check_true(?? d2.valid, result("for valid"));
+
+                            behave_rt := (others => '0');
+                            if rb /= x"0000000000000000" then
+                                behave_rt := std_ulogic_vector(unsigned(ra) rem unsigned(rb));
+                            end if;
+                            check_equal(d2.write_reg_data, behave_rt, result("for modud"));
+                        end loop;
                    end loop;
-                    assert d2.valid = '1';
-
-                    behave_rt := (others => '0');
-                    if rb /= x"0000000000000000" then
-                        rem32 := std_ulogic_vector(signed(ra(31 downto 0)) rem signed(rb(31 downto 0)));
-                        if rem32(31) = '0' then
-                            behave_rt := x"00000000" & rem32;
-                        else
-                            behave_rt := x"ffffffff" & rem32;
-                        end if;
-                    end if;
-                    assert behave_rt = d2.write_reg_data
-                        report "bad modsw expected " & to_hstring(behave_rt) & " got " & to_hstring(d2.write_reg_data);
                end loop;
-            end loop;
-        end loop;
-
-        -- test moduw
-        report "test moduw";
-        moduw_loop : for dlength in 1 to 4 loop
-            for vlength in 1 to dlength loop
-                for i in 0 to 100 loop
-                    ra := std_ulogic_vector(resize(unsigned(pseudorand(dlength * 8)), 64));
-                    rb := std_ulogic_vector(resize(unsigned(pseudorand(vlength * 8)), 64));
-
-                    d1.dividend <= ra;
-                    d1.divisor <= rb;
-                    d1.is_signed <= '0';
-                    d1.neg_result <= '0';
-                    d1.is_extended <= '0';
-                    d1.is_32bit <= '1';
-                    d1.is_modulus <= '1';
-                    d1.valid <= '1';
-
-                    wait for clk_period;

-                    d1.valid <= '0';
-                    for j in 0 to 66 loop
-                        wait for clk_period;
-                        if d2.valid = '1' then
-                            exit;
-                        end if;
+            elsif run("Test modsw") then
+                modsw_loop : for dlength in 1 to 4 loop
+                    for vlength in 1 to dlength loop
+                        for i in 0 to 100 loop
+                            ra := std_ulogic_vector(resize(signed(rnd.RandSlv(dlength * 8)), 64));
+                            rb := std_ulogic_vector(resize(signed(rnd.RandSlv(vlength * 8)), 64));
+
+                            d1.dividend <= ra when ra(63) = '0' else std_ulogic_vector(- signed(ra));
+                            d1.divisor <= rb when rb(63) = '0' else std_ulogic_vector(- signed(rb));
+                            d1.is_signed <= '1';
+                            d1.neg_result <= ra(63);
+                            d1.is_32bit <= '1';
+                            d1.is_modulus <= '1';
+                            d1.valid <= '1';
+
+                            wait for clk_period;
+
+                            d1.valid <= '0';
+                            for j in 0 to 66 loop
+                                wait for clk_period;
+                                if d2.valid = '1' then
+                                    exit;
+                                end if;
+                            end loop;
+                            check_true(?? d2.valid, result("for valid"));
+
+                            behave_rt := (others => '0');
+                            if rb /= x"0000000000000000" then
+                                rem32 := std_ulogic_vector(signed(ra(31 downto 0)) rem signed(rb(31 downto 0)));
+                                if rem32(31) = '0' then
+                                    behave_rt := x"00000000" & rem32;
+                                else
+                                    behave_rt := x"ffffffff" & rem32;
+                                end if;
+                            end if;
+                            check_equal(d2.write_reg_data, behave_rt, result("for modsw"));
+                        end loop;
                    end loop;
-                    assert d2.valid = '1';
+                end loop;

-                    behave_rt := (others => '0');
-                    if rb /= x"0000000000000000" then
-                        behave_rt := x"00000000" & std_ulogic_vector(unsigned(ra(31 downto 0)) rem unsigned(rb(31 downto 0)));
-                    end if;
-                    assert behave_rt(31 downto 0) = d2.write_reg_data(31 downto 0)
-                        report "bad moduw expected " & to_hstring(behave_rt) & " got " & to_hstring(d2.write_reg_data);
+            elsif run("Test moduw") then
+                moduw_loop : for dlength in 1 to 4 loop
+                    for vlength in 1 to dlength loop
+                        for i in 0 to 100 loop
+                            ra := std_ulogic_vector(resize(unsigned(rnd.RandSlv(dlength * 8)), 64));
+                            rb := std_ulogic_vector(resize(unsigned(rnd.RandSlv(vlength * 8)), 64));
+
+                            d1.dividend <= ra;
+                            d1.divisor <= rb;
+                            d1.is_32bit <= '1';
+                            d1.is_modulus <= '1';
+                            d1.valid <= '1';
+
+                            wait for clk_period;
+
+                            d1.valid <= '0';
+                            for j in 0 to 66 loop
+                                wait for clk_period;
+                                if d2.valid = '1' then
+                                    exit;
+                                end if;
+                            end loop;
+                            check_true(?? d2.valid, result("for valid"));
+
+                            behave_rt := (others => '0');
+                            if rb /= x"0000000000000000" then
+                                behave_rt := x"00000000" & std_ulogic_vector(unsigned(ra(31 downto 0)) rem unsigned(rb(31 downto 0)));
+                            end if;
+                            check_equal(d2.write_reg_data(31 downto 0), behave_rt(31 downto 0), result("for moduw"));
+                        end loop;
+                    end loop;
                end loop;
-            end loop;
+            end if;
        end loop;

-        std.env.finish;
+        test_runner_cleanup(runner);
    end process;
 end behave;
--- a/dmi_dtm_ecp5.vhdl
+++ b/dmi_dtm_ecp5.vhdl
@ -0,0 +1,298 @@
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.math_real.all;
+
+library work;
+use work.wishbone_types.all;
+
+-- JTAG-to-DMI bridge for the ECP5 JTAGG primitive: requests shifted in
+-- over JTAG are presented on the dmi_* ports in the sys_clk domain.
+--   ABITS : width of dmi_addr
+--   DBITS : width of dmi_din / dmi_dout
+entity dmi_dtm is
+    generic(ABITS : INTEGER:=8;
+            DBITS : INTEGER:=64);
+
+    port(sys_clk   : in std_ulogic;  -- clocks the jtag_req -> dmi_req synchronizer
+         sys_reset : in std_ulogic;  -- synchronous in dmi_req_sync, async in the JTAG-side processes
+         dmi_addr  : out std_ulogic_vector(ABITS - 1 downto 0);  -- address field of the latched request
+         dmi_din   : in std_ulogic_vector(DBITS - 1 downto 0);   -- presumably read data returned to JTAG -- TODO confirm
+         dmi_dout  : out std_ulogic_vector(DBITS - 1 downto 0);  -- data field of the latched request
+         dmi_req   : out std_ulogic;  -- jtag_req after two sys_clk register stages
+         dmi_wr    : out std_ulogic;  -- '1' when the latched op is DMI_REQ_WR
+         dmi_ack   : in std_ulogic    -- re-registered twice on jtag_clk before use
+--         dmi_err : in std_ulogic TODO: Add error response
+         );
+end entity dmi_dtm;
+
+architecture behaviour of dmi_dtm is
+    -- Signals coming out of the JTAGG block
+    signal jtag_reset_n : std_ulogic;
+    signal tdi        : std_ulogic;
+    signal tdo        : std_ulogic;
+    signal tck        : std_ulogic;
+    signal jce1       : std_ulogic;
+    signal jshift     : std_ulogic;
+    signal update     : std_ulogic;
+
+    -- signals to match dmi_dtm_xilinx
+    signal jtag_reset : std_ulogic;
+    signal capture    : std_ulogic;
+    signal jtag_clk   : std_ulogic;
+    signal sel        : std_ulogic;
+    signal shift      : std_ulogic;
+
+    -- delays
+    signal jce1_d     : std_ulogic;
+    constant TCK_DELAY : INTEGER := 8;
+    signal tck_d : std_ulogic_vector(TCK_DELAY+1 downto 1);
+
+    -- ** JTAG clock domain **
+
+    -- Shift register
+    signal shiftr : std_ulogic_vector(ABITS + DBITS + 1 downto 0);
+
+    -- Latched request
+    signal request : std_ulogic_vector(ABITS + DBITS + 1 downto 0);
+
+    -- A request is present
+    signal jtag_req : std_ulogic;
+
+    -- Synchronizer for jtag_rsp (sys clk -> jtag_clk)
+    signal dmi_ack_0 : std_ulogic;
+    signal dmi_ack_1 : std_ulogic;
+
+    -- ** sys clock domain **
+
+    -- Synchronizer for jtag_req (jtag clk -> sys clk)
+    signal jtag_req_0 : std_ulogic;
+    signal jtag_req_1 : std_ulogic;
+
+    -- ** combination signals
+    signal jtag_bsy : std_ulogic;
+    signal op_valid : std_ulogic;
+    signal rsp_op   : std_ulogic_vector(1 downto 0);
+
+    -- ** Constants **
+    constant DMI_REQ_NOP : std_ulogic_vector(1 downto 0) := "00";
+    constant DMI_REQ_RD  : std_ulogic_vector(1 downto 0) := "01";
+    constant DMI_REQ_WR  : std_ulogic_vector(1 downto 0) := "10";
+    constant DMI_RSP_OK  : std_ulogic_vector(1 downto 0) := "00";
+    constant DMI_RSP_BSY : std_ulogic_vector(1 downto 0) := "11";
+
+    attribute ASYNC_REG : string;
+    attribute ASYNC_REG of jtag_req_0: signal is "TRUE";
+    attribute ASYNC_REG of jtag_req_1: signal is "TRUE";
+    attribute ASYNC_REG of dmi_ack_0: signal is "TRUE";
+    attribute ASYNC_REG of dmi_ack_1: signal is "TRUE";
+
+    -- ECP5 JTAGG
+    component JTAGG is
+        generic (
+            ER1 : string := "ENABLED";
+            ER2 : string := "ENABLED"
+        );
+        port(
+            JTDO1 : in std_ulogic;
+            JTDO2 : in std_ulogic;
+            JTDI : out std_ulogic;
+            JTCK : out std_ulogic;
+            JRTI1 : out std_ulogic;
+            JRTI2 : out std_ulogic;
+            JSHIFT : out std_ulogic;
+            JUPDATE : out std_ulogic;
+            JRSTN : out std_ulogic;
+            JCE1 : out std_ulogic;
+            JCE2 : out std_ulogic
+        );
+    end component;
+
+    component LUT4 is
+        generic (
+            INIT : std_logic_vector
+        );
+        port(
+          A : in STD_ULOGIC;
+          B : in STD_ULOGIC;
+          C : in STD_ULOGIC;
+          D : in STD_ULOGIC;
+          Z : out STD_ULOGIC
+        );
+    end component;
+
+begin
+
+    jtag: JTAGG
+        generic map(
+            ER2 => "DISABLED"
+        )
+        port map (
+            JTDO1 => tdo,
+            JTDO2 => '0',
+            JTDI => tdi,
+            JTCK => tck,
+            JRTI1 => open,
+            JRTI2 => open,
+            JSHIFT => jshift,
+            JUPDATE => update,
+            JRSTN => jtag_reset_n,
+            JCE1 => jce1,
+            JCE2 => open
+        );
+
+    -- JRTI1 looks like it could be connected to SEL, but
+    -- in practice JRTI1 is only high briefly, not for the duration
+    -- of the transmission. possibly mw_debug could be modified.
+    -- The ecp5 is probably the only jtag device anyway.
+    sel <= '1';
+
+    -- TDI needs to align with TCK, we use LUT delays here.
+    -- From https://github.com/enjoy-digital/litex/pull/1087
+    --
+    -- tck_d(1) is the raw TCK from JTAGG; each generate iteration passes
+    -- it through one more LUT4, and the output of the last stage,
+    -- tck_d(TCK_DELAY+1), becomes jtag_clk for the rest of this design.
+    tck_d(1) <= tck;
+    del: for i in 1 to TCK_DELAY generate
+        -- NOTE(review): "keep" is presumably there so the tools do not
+        -- optimize the otherwise redundant LUTs away -- confirm per toolchain.
+        attribute keep : boolean;
+        attribute keep of l: label is true;
+    begin
+        l: LUT4
+            generic map(
+                -- Only INIT bit 1 is set; with B, C and D tied low this
+                -- presumably makes Z follow A (a buffer) -- TODO confirm
+                -- against the LUT4 primitive's INIT indexing.
+                INIT => b"0000_0000_0000_0010"
+            )
+            port map (
+                A => tck_d(i),
+                B => '0', C => '0', D => '0',
+                Z => tck_d(i+1)
+            );
+    end generate;
+    jtag_clk <= tck_d(TCK_DELAY+1);
+
+    -- capture signal
+    -- Registers JCE1 on jtag_clk and turns its rising edge into a
+    -- registered pulse: capture is set on a clock edge where jce1 is
+    -- high but the previously sampled value (jce1_d) was low.
+    -- Neither register has a reset.
+    jce1_sync : process(jtag_clk)
+    begin
+        if rising_edge(jtag_clk) then
+            jce1_d <= jce1;
+            -- uses the old jce1_d (signal update is deferred), so this
+            -- is a one-cycle edge detect
+            capture <= jce1 and not jce1_d;
+        end if;
+    end process;
+
+    -- Register JSHIFT on the delayed clock, otherwise we miss the last
+    -- shift in (maybe because we are delaying tck?).
+    -- sys_reset clears the register asynchronously, so it has to be in
+    -- the sensitivity list for simulation to match the synthesized flop.
+    shift_sync : process(jtag_clk, sys_reset)
+    begin
+        if (sys_reset = '1') then
+            shift <= '0';
+        elsif rising_edge(jtag_clk) then
+            shift <= jshift;
+        end if;
+    end process;
+
+    jtag_reset <= not jtag_reset_n;
+
+    -- dmi_req synchronization
+    -- Two-stage register chain (jtag_req -> jtag_req_0 -> jtag_req_1)
+    -- clocked by sys_clk; jtag_req itself is driven from the jtag_clk
+    -- side. Both stages are cleared synchronously by sys_reset.
+    dmi_req_sync : process(sys_clk)
+    begin
+        -- sys_reset is synchronous
+        if rising_edge(sys_clk) then
+            if (sys_reset = '1') then
+                jtag_req_0 <= '0';
+                jtag_req_1 <= '0';
+            else
+                jtag_req_0 <= jtag_req;
+                jtag_req_1 <= jtag_req_0;
+            end if;
+        end if;
+    end process;
+    -- The second stage is what the DMI side sees as its request strobe.
+    dmi_req <= jtag_req_1;
+
+    -- dmi_ack synchronization: dmi_ack passes through two registers
+    -- clocked by jtag_clk (dmi_ack_0, then dmi_ack_1).
+    dmi_ack_sync: process(jtag_clk, jtag_reset)
+    begin
+        -- jtag_reset is async (see comments)
+        if jtag_reset = '1' then
+            dmi_ack_1 <= '0';
+            dmi_ack_0 <= '0';
+        elsif rising_edge(jtag_clk) then
+            -- Second stage takes the first stage's value from before
+            -- this edge; first stage samples the incoming ack.
+            dmi_ack_1 <= dmi_ack_0;
+            dmi_ack_0 <= dmi_ack;
+        end if;
+    end process;
+   
+    -- A new request may only start when none is in flight (jtag_req)
+    -- and the synchronized ack of the previous one has dropped;
+    -- jtag_bsy is high in every other case.
+    jtag_bsy <= dmi_ack_1 or jtag_req;
+
+    -- The op field of the shift register is valid when it holds
+    -- a read or a write request
+    op_valid <= '1' when shiftr(1 downto 0) = DMI_REQ_RD
+                      or shiftr(1 downto 0) = DMI_REQ_WR
+                else '0';
+
+    -- Response op: busy while jtag_bsy is set, OK otherwise
+    rsp_op <= DMI_RSP_OK when jtag_bsy /= '1' else DMI_RSP_BSY;
+
+    -- DMI outputs are taken straight from the latched request:
+    -- op in bits 1..0, data above it, address in the top ABITS bits
+    dmi_wr   <= '1' when request(1 downto 0) = DMI_REQ_WR else '0';
+    dmi_dout <= request(DBITS + 1 downto 2);
+    dmi_addr <= request(ABITS + DBITS + 1 downto DBITS + 2);
+
+    -- TDO is wired to shift register bit 0
+    tdo <= shiftr(0);
+
+    -- Main state machine. Handles shift registers, request latch and
+    -- jtag_req latch. Could be split into 3 processes but it's probably
+    -- not worthwhile.
+    --
+    -- Several of the branches below may assign the same signal on one
+    -- clock edge; the last assignment in process order wins, so the
+    -- order of the "if" statements is significant (e.g. capture
+    -- overrides shift and the busy marking done on update).
+    --
+    shifter: process(jtag_clk, jtag_reset, sys_reset)
+    begin
+        -- Asynchronous reset from either the JTAG reset or the system reset
+        if jtag_reset = '1' or sys_reset = '1' then
+            shiftr <= (others => '0');
+            jtag_req <= '0';
+            request <= (others => '0');
+        elsif rising_edge(jtag_clk) then
+
+            -- Handle jtag "commands" when sel is 1
+            if sel = '1' then
+                -- Shift state, rotate the register
+                -- (tdi enters at the top, bit 0 is what drives tdo)
+                if shift = '1' then
+                    shiftr <= tdi & shiftr(ABITS + DBITS + 1 downto 1);
+                end if;
+
+                -- Update state (trigger)
+                --
+                -- Latch the request if we aren't already processing one and
+                -- it has a valid command opcode.
+                --
+                    if update = '1' and op_valid = '1' then
+                    if jtag_bsy = '0' then
+                        request <= shiftr;
+                        jtag_req <= '1';
+                    end if;
+                    -- Set the shift register "op" to "busy". This will prevent
+                    -- us from re-starting the command on the next update if
+                    -- the command completes before that.
+                    shiftr(1 downto 0) <= DMI_RSP_BSY;
+                end if;
+
+                -- Request completion.
+                --
+                -- Capture the response data for reads and clear request flag.
+                --
+                -- Note: We clear req (and thus dmi_req) here which relies on tck
+                -- ticking and sel set. This means we are stuck with dmi_req up if
+                -- the jtag interface stops. Slaves must be resilient to this.
+                --
+                if jtag_req = '1' and dmi_ack_1 = '1' then
+                    jtag_req <= '0';
+                    -- For reads, the data field of the latched request is
+                    -- replaced with the value returned on dmi_din
+                    if request(1 downto 0) = DMI_REQ_RD then
+                        request(DBITS + 1 downto 2) <= dmi_din;
+                    end if;
+                end if;
+
+                -- Capture state, grab latch content with updated status
+                -- (address and data from request, op replaced by rsp_op)
+                if capture = '1' then
+                    shiftr <= request(ABITS + DBITS + 1 downto 2) & rsp_op;
+                end if;
+
+            end if;
+        end if;
+    end process;
+end architecture behaviour;
+
--- a/dmi_dtm_jtag.vhdl
+++ b/dmi_dtm_jtag.vhdl
@ -1,301 +0,0 @@
-- JTAG to DMI interface, based on the Xilinx version
--
-- DMI bus
--
--  req : ____/------------\_____
--  addr: xxxx<            >xxxxx, based on the Xilinx version
--  dout: xxxx<            >xxxxx
--  wr  : xxxx<            >xxxxx
--  din : xxxxxxxxxxxx<      >xxx
--  ack : ____________/------\___
--
--  * addr/dout set along with req, can be latched on same cycle by slave
--  * ack & din remain up until req is dropped by master, the slave must
--    provide a stable output on din on reads during that time.
--  * req remains low at until at least one sysclk after ack seen down.
--
--   JTAG (tck)                    DMI (sys_clk)
--
--   * jtag_req = 1
--        (jtag_req_0)             *
--          (jtag_req_1) ->        * dmi_req = 1 >
--                                 *.../...
--                                 * dmi_ack = 1 <
--   *                         (dmi_ack_0)
--   *                   <-  (dmi_ack_1)
--   * jtag_req = 0 (and latch dmi_din)
--        (jtag_req_0)             *
--          (jtag_req_1) ->        * dmi_req = 0 >
--                                 * dmi_ack = 0 <
--  *                          (dmi_ack_0)
--  *                    <-  (dmi_ack_1)
--
--  jtag_req can go back to 1 when jtag_rsp_1 is 0
--
--  Questions/TODO:
--    - I use 2 flip fops for sync, is that enough ?
--    - I treat the jtag_trst as an async reset, is that necessary ?
--    - Dbl check reset situation since we have two different resets
--      each only resetting part of the logic...
--    - Look at optionally removing the synchronizer on the ack path,
--      assuming JTAG is always slow enough that ack will have been
--      stable long enough by the time CAPTURE comes in.
--    - We could avoid the latched request by not shifting while a
--      request is in progress (and force TDO to 1 to return a busy
--      status).
--
--  WARNING: This isn't the real DMI JTAG protocol (at least not yet).
--           a command while busy will be ignored. A response of "11"
--           means the previous command is still going, try again.
--           As such We don't implement the DMI "error" status, and
--           we don't implement DTMCS yet... This may still all change
--           but for now it's easier that way as the real DMI protocol
--           requires for a command to work properly that enough TCK
--           are sent while IDLE and I'm having trouble getting that
--           working with UrJtag and the Xilinx BSCAN2 for now.
-
-library ieee;
-use ieee.std_logic_1164.all;
-use ieee.math_real.all;
-
-library work;
-use work.wishbone_types.all;
-
-entity dmi_dtm_jtag is
-    generic(ABITS : INTEGER:=8;
-	    DBITS : INTEGER:=32);
-
-    port(sys_clk	: in std_ulogic;
-	 sys_reset	: in std_ulogic;
-	 dmi_addr	: out std_ulogic_vector(ABITS - 1 downto 0);
-	 dmi_din	: in std_ulogic_vector(DBITS - 1 downto 0);
-	 dmi_dout	: out std_ulogic_vector(DBITS - 1 downto 0);
-	 dmi_req	: out std_ulogic;
-	 dmi_wr		: out std_ulogic;
-	 dmi_ack	: in std_ulogic;
--	 dmi_err	: in std_ulogic TODO: Add error response
-         jtag_tck       : in std_ulogic;
-         jtag_tdi       : in std_ulogic;
-         jtag_tms       : in std_ulogic;
-	 jtag_trst      : in std_ulogic;
-         jtag_tdo       : out std_ulogic
-	 );
-end entity dmi_dtm_jtag;
-
-architecture behaviour of dmi_dtm_jtag is
-
-    -- Signals coming out of the JTAG TAP controller
-    signal capture		: std_ulogic;
-    signal update		: std_ulogic;
-    signal sel			: std_ulogic;
-    signal shift		: std_ulogic;
-    signal tdi			: std_ulogic;
-    signal tdo			: std_ulogic;
-
-    -- ** JTAG clock domain **
-
-    -- Shift register
-    signal shiftr	: std_ulogic_vector(ABITS + DBITS + 1 downto 0);
-
-    -- Latched request
-    signal request	: std_ulogic_vector(ABITS + DBITS + 1 downto 0);
-
-    -- A request is present
-    signal jtag_req	: std_ulogic;
-
-    -- Synchronizer for jtag_rsp (sys clk -> jtag_tck)
-    signal dmi_ack_0	: std_ulogic;
-    signal dmi_ack_1	: std_ulogic;
-
-    -- ** sys clock domain **
-
-    -- Synchronizer for jtag_req (jtag clk -> sys clk)
-    signal jtag_req_0	: std_ulogic;
-    signal jtag_req_1	: std_ulogic;
-
-    -- ** combination signals
-    signal jtag_bsy	: std_ulogic;
-    signal op_valid	: std_ulogic;
-    signal rsp_op	: std_ulogic_vector(1 downto 0);
-
-    -- ** Constants **
-    constant DMI_REQ_NOP : std_ulogic_vector(1 downto 0) := "00";
-    constant DMI_REQ_RD  : std_ulogic_vector(1 downto 0) := "01";
-    constant DMI_REQ_WR  : std_ulogic_vector(1 downto 0) := "10";
-    constant DMI_RSP_OK  : std_ulogic_vector(1 downto 0) := "00";
-    constant DMI_RSP_BSY : std_ulogic_vector(1 downto 0) := "11";
-
-    attribute ASYNC_REG : string;
-    attribute ASYNC_REG of jtag_req_0: signal is "TRUE";
-    attribute ASYNC_REG of jtag_req_1: signal is "TRUE";
-    attribute ASYNC_REG of dmi_ack_0: signal is "TRUE";
-    attribute ASYNC_REG of dmi_ack_1: signal is "TRUE";
-
-    component tap_top port (
-        -- JTAG pads
-        tms_pad_i : in std_ulogic;
-        tck_pad_i : in std_ulogic;
-        trst_pad_i : in std_ulogic;
-        tdi_pad_i : in std_ulogic;
-        tdo_pad_o : out std_ulogic;
-        tdo_padoe_o : out std_ulogic;
-
-        -- TAP states
-        shift_dr_o : out std_ulogic;
-        pause_dr_o : out std_ulogic;
-        update_dr_o : out std_ulogic;
-        capture_dr_o : out std_ulogic;
-
-        -- Select signals for boundary scan or mbist
-        extest_select_o : out std_ulogic;
-        sample_preload_select_o : out std_ulogic;
-        mbist_select_o : out std_ulogic;
-        debug_select_o : out std_ulogic;
-
-        -- TDO signal that is connected to TDI of sub-modules.
-        tdo_o : out std_ulogic;
-
-        -- TDI signals from sub-modules
-        debug_tdi_i : in std_ulogic;
-        bs_chain_tdi_i : in std_ulogic;
-        mbist_tdi_i : in std_ulogic
-        );
-    end component;
-
-begin
-    tap_top0 : tap_top
-	port map (
-	    tms_pad_i               => jtag_tms,
-	    tck_pad_i               => jtag_tck,
-	    trst_pad_i              => jtag_trst,
-	    tdi_pad_i               => jtag_tdi,
-	    tdo_pad_o               => jtag_tdo,
-	    tdo_padoe_o             => open,      -- what to do with this?
-
-	    shift_dr_o              => shift,
-	    pause_dr_o              => open,      -- what to do with this?
-	    update_dr_o             => update,
-	    capture_dr_o            => capture,
-
-	    -- connect boundary scan and mbist?
-	    extest_select_o         => open,
-	    sample_preload_select_o => open,
-	    mbist_select_o          => open,
-	    debug_select_o          => sel,
-
-	    tdo_o                   => tdi,
-            debug_tdi_i             => tdo,
-	    bs_chain_tdi_i          => '0',
-            mbist_tdi_i             => '0'
-	    );
-
-    -- dmi_req synchronization
-    dmi_req_sync : process(sys_clk)
-    begin
-	-- sys_reset is synchronous
-	if rising_edge(sys_clk) then
-	    if (sys_reset = '1') then
-		jtag_req_0 <= '0';
-		jtag_req_1 <= '0';
-	    else
-		jtag_req_0 <= jtag_req;
-		jtag_req_1 <= jtag_req_0;
-	    end if;
-	end if;
-    end process;
-    dmi_req <= jtag_req_1;
-
-    -- dmi_ack synchronization
-    dmi_ack_sync: process(jtag_tck, jtag_trst)
-    begin
-	-- jtag_trst is async (see comments)
-	if jtag_trst = '1' then
-	    dmi_ack_0 <= '0';
-	    dmi_ack_1 <= '0';
-	elsif rising_edge(jtag_tck) then
-	    dmi_ack_0 <= dmi_ack;
-	    dmi_ack_1 <= dmi_ack_0;
-	end if;
-    end process;
-
-    -- jtag_bsy indicates whether we can start a new request, we can when
-    -- we aren't already processing one (jtag_req) and the synchronized ack
-    -- of the previous one is 0.
-    --
-    jtag_bsy <= jtag_req or dmi_ack_1;
-
-    -- decode request type in shift register
-    with shiftr(1 downto 0) select op_valid <=
-	'1' when DMI_REQ_RD,
-	'1' when DMI_REQ_WR,
-	'0' when others;
-
-    -- encode response op
-    rsp_op <= DMI_RSP_BSY when jtag_bsy = '1' else DMI_RSP_OK;
-
-    -- Some DMI out signals are directly driven from the request register
-    dmi_addr <= request(ABITS + DBITS + 1 downto DBITS + 2);
-    dmi_dout <= request(DBITS + 1 downto 2);
-    dmi_wr   <= '1' when request(1 downto 0) = DMI_REQ_WR else '0';
-
-    -- TDO is wired to shift register bit 0
-    tdo <= shiftr(0);
-
-    -- Main state machine. Handles shift registers, request latch and
-    -- jtag_req latch. Could be split into 3 processes but it's probably
-    -- not worthwhile.
-    --
-    shifter: process(jtag_tck, jtag_trst)
-    begin
-	if jtag_trst = '1' then
-	    shiftr <= (others => '0');
-	    jtag_req <= '0';
-	elsif rising_edge(jtag_tck) then
-
-	    -- Handle jtag "commands" when sel is 1
-	    if sel = '1' then
-		-- Shift state, rotate the register
-		if shift = '1' then
-		    shiftr <= tdi & shiftr(ABITS + DBITS + 1 downto 1);
-		end if;
-
-		-- Update state (trigger)
-		--
-		-- Latch the request if we aren't already processing one and
-		-- it has a valid command opcode.
-		--
-	    	if update = '1' and op_valid = '1' then
-		    if jtag_bsy = '0' then
-			request <= shiftr;
-			jtag_req <= '1';
-		    end if;
-		    -- Set the shift register "op" to "busy". This will prevent
-		    -- us from re-starting the command on the next update if
-		    -- the command completes before that.
-		    shiftr(1 downto 0) <= DMI_RSP_BSY;
-		end if;
-
-		-- Request completion.
-		--
-		-- Capture the response data for reads and clear request flag.
-		--
-		-- Note: We clear req (and thus dmi_req) here which relies on tck
-		-- ticking and sel set. This means we are stuck with dmi_req up if
-		-- the jtag interface stops. Slaves must be resilient to this.
-		--
-		if jtag_req = '1' and dmi_ack_1 = '1' then
-		    jtag_req <= '0';
-		    if request(1 downto 0) = DMI_REQ_RD then
-			request(DBITS + 1 downto 2) <= dmi_din;
-		    end if;
-		end if;
-
-		-- Capture state, grab latch content with updated status
-		if capture = '1' then
-		    shiftr <= request(ABITS + DBITS + 1 downto 2) & rsp_op;
-		end if;
-
-	    end if;
-	end if;
-    end process;
-end architecture behaviour;
--- a/dmi_dtm_tb.vhdl
+++ b/dmi_dtm_tb.vhdl
@ -19,12 +19,12 @@ architecture behave of dmi_dtm_tb is
    constant jclk_period : time := 30 ns;

    -- DMI debug bus signals
-    signal dmi_addr	: std_ulogic_vector(7 downto 0);
-    signal dmi_din	: std_ulogic_vector(63 downto 0);
-    signal dmi_dout	: std_ulogic_vector(63 downto 0);
-    signal dmi_req	: std_ulogic;
-    signal dmi_wr	: std_ulogic;
-    signal dmi_ack	: std_ulogic;
+    signal dmi_addr : std_ulogic_vector(7 downto 0);
+    signal dmi_din  : std_ulogic_vector(63 downto 0);
+    signal dmi_dout : std_ulogic_vector(63 downto 0);
+    signal dmi_req  : std_ulogic;
+    signal dmi_wr   : std_ulogic;
+    signal dmi_ack  : std_ulogic;

    -- Global JTAG signals (used by BSCANE2 inside dmi_dtm
    alias j : glob_jtag_t is glob_jtag;
@ -35,216 +35,216 @@ architecture behave of dmi_dtm_tb is

 begin
    dtm: entity work.dmi_dtm
-	generic map(
-	    ABITS => 8,
-	    DBITS => 64
-	    )
-	port map(
-	    sys_clk	=> clk,
-	    sys_reset	=> rst,
-	    dmi_addr	=> dmi_addr,
-	    dmi_din	=> dmi_din,
-	    dmi_dout	=> dmi_dout,
-	    dmi_req	=> dmi_req,
-	    dmi_wr	=> dmi_wr,
-	    dmi_ack	=> dmi_ack
-	    );
+        generic map(
+            ABITS => 8,
+            DBITS => 64
+            )
+        port map(
+            sys_clk   => clk,
+            sys_reset => rst,
+            dmi_addr  => dmi_addr,
+            dmi_din   => dmi_din,
+            dmi_dout  => dmi_dout,
+            dmi_req   => dmi_req,
+            dmi_wr    => dmi_wr,
+            dmi_ack   => dmi_ack
+            );

    simple_ram_0: entity work.wishbone_bram_wrapper
-	generic map(RAM_INIT_FILE => "main_ram.bin",
-		    MEMORY_SIZE => 524288)
-	port map(clk => clk, rst => rst,
-		 wishbone_in => wishbone_ram_out,
-		 wishbone_out => wishbone_ram_in);
+        generic map(RAM_INIT_FILE => "main_ram.bin",
+                    MEMORY_SIZE => 524288)
+        port map(clk => clk, rst => rst,
+                 wishbone_in => wishbone_ram_out,
+                 wishbone_out => wishbone_ram_in);

    wishbone_debug_0: entity work.wishbone_debug_master
-	port map(clk => clk, rst => rst,
-		 dmi_addr => dmi_addr(1 downto 0),
-		 dmi_dout => dmi_din,
-		 dmi_din => dmi_dout,
-		 dmi_wr => dmi_wr,
-		 dmi_ack => dmi_ack,
-		 dmi_req => dmi_req,
-		 wb_in => wishbone_ram_in,
-		 wb_out => wishbone_ram_out);
+        port map(clk => clk, rst => rst,
+                 dmi_addr => dmi_addr(1 downto 0),
+                 dmi_dout => dmi_din,
+                 dmi_din => dmi_dout,
+                 dmi_wr => dmi_wr,
+                 dmi_ack => dmi_ack,
+                 dmi_req => dmi_req,
+                 wb_in => wishbone_ram_in,
+                 wb_out => wishbone_ram_out);

    -- system clock
    sys_clk: process
    begin
-	clk <= '1';
-	wait for clk_period / 2;
-	clk <= '0';
-	wait for clk_period / 2;
+        clk <= '1';
+        wait for clk_period / 2;
+        clk <= '0';
+        wait for clk_period / 2;
    end process sys_clk;

    -- system sim: just reset and wait
    sys_sim: process
    begin
-	rst <= '1';
-	wait for clk_period;
-	rst <= '0';
-	wait;
+        rst <= '1';
+        wait for clk_period;
+        rst <= '0';
+        wait;
    end process;

    -- jtag sim process
    sim_jtag: process
-	procedure clock(count: in INTEGER) is
-	begin
-	    for i in 1 to count loop
-		j.tck <= '0';
-		wait for jclk_period/2;
-		j.tck <= '1';
-		wait for jclk_period/2;
-	    end loop;
-	end procedure clock;
-
-	procedure shift_out(val: in std_ulogic_vector) is
-	begin
-	    for i in 0 to val'length-1 loop
-		j.tdi <= val(i);
-		clock(1);
-	    end loop;
-	end procedure shift_out;
-
-	procedure shift_in(val: out std_ulogic_vector) is
-	begin
-	    for i in val'length-1 downto 0 loop
-		val := j.tdo & val(val'length-1 downto 1);
-		clock(1);
-	    end loop;
-	end procedure shift_in;
-
-	procedure send_command(
-	    addr : in std_ulogic_vector(7 downto 0);
-	    data : in std_ulogic_vector(63 downto 0);
-	    op   : in std_ulogic_vector(1 downto 0)) is
-	begin
-	    j.capture <= '1';
-	    clock(1);
-	    j.capture <= '0';	
-	    clock(1);
-	    j.shift <= '1';
-	    shift_out(op);
-	    shift_out(data);
-	    shift_out(addr);
-	    j.shift <= '0';
-	    j.update <= '1';
-	    clock(1);
-	    j.update <= '0';
-	    clock(1);
-	end procedure send_command;	
-
-	procedure read_resp(
-	    op   : out std_ulogic_vector(1 downto 0);
-	    data : out std_ulogic_vector(63 downto 0)) is
-
-	    variable addr : std_ulogic_vector(7 downto 0);
-	begin
-	    j.capture <= '1';
-	    clock(1);
-	    j.capture <= '0';	
-	    clock(1);
-	    j.shift <= '1';
-	    shift_in(op);
-	    shift_in(data);
-	    shift_in(addr);
-	    j.shift <= '0';
-	    j.update <= '1';
-	    clock(1);
-	    j.update <= '0';
-	    clock(1);
-	end procedure read_resp;	
-
-	procedure dmi_write(addr : in std_ulogic_vector(7 downto 0);
-			    data : in std_ulogic_vector(63 downto 0)) is
-	    variable resp_op   : std_ulogic_vector(1 downto 0);
-	    variable resp_data : std_ulogic_vector(63 downto 0);
-	    variable timeout   : integer;
-	begin
-	    send_command(addr, data, "10");
-	    loop
-		read_resp(resp_op, resp_data);
-		case resp_op is
-		when "00" =>
-		    return;
-		when "11" =>
-		    timeout := timeout + 1;
-		    assert timeout < 0
-			report "dmi_write timed out !" severity error;
-		when others =>
-		    assert 0 > 1 report "dmi_write got odd status: " &
-			to_hstring(resp_op) severity error;
-		end case;
-	    end loop;
-	end procedure dmi_write;
-	
-
-	procedure dmi_read(addr : in std_ulogic_vector(7 downto 0);
-			   data : out std_ulogic_vector(63 downto 0)) is
-	    variable resp_op   : std_ulogic_vector(1 downto 0);
-	    variable timeout   : integer;
-	begin
-	    send_command(addr, (others => '0'), "01");
-	    loop
-		read_resp(resp_op, data);
-		case resp_op is
-		when "00" =>
-		    return;
-		when "11" =>
-		    timeout := timeout + 1;
-		    assert timeout < 0
-			report "dmi_read timed out !" severity error;
-		when others =>
-		    assert 0 > 1 report "dmi_read got odd status: " &
-			to_hstring(resp_op) severity error;
-		end case;
-	    end loop;
-	end procedure dmi_read;
-
-	variable data : std_ulogic_vector(63 downto 0);
+        -- Generate "count" TCK periods: tck is driven low for half of
+        -- jclk_period, then high for the other half.
+        procedure clock(count: in INTEGER) is
+            constant half_period : time := jclk_period/2;
+        begin
+            for cycle in 1 to count loop
+                j.tck <= '0';
+                wait for half_period;
+                j.tck <= '1';
+                wait for half_period;
+            end loop;
+        end procedure clock;
+
+        -- Drive the elements of val onto TDI one at a time, starting
+        -- at index 0, with one TCK period per element.
+        procedure shift_out(val: in std_ulogic_vector) is
+            constant last_bit : integer := val'length - 1;
+        begin
+            for bit_idx in 0 to last_bit loop
+                j.tdi <= val(bit_idx);
+                clock(1);
+            end loop;
+        end procedure shift_out;
+
+        -- Sample TDO once per TCK period, val'length times. Each sample
+        -- enters at the top of val and earlier samples move down one
+        -- position, so the first sample ends up in element 0.
+        procedure shift_in(val: out std_ulogic_vector) is
+            constant top : integer := val'length - 1;
+        begin
+            for n in top downto 0 loop
+                val := j.tdo & val(top downto 1);
+                clock(1);
+            end loop;
+        end procedure shift_in;
+
+        -- Send one DMI command through the simulated JTAG signals:
+        -- a capture pulse, then op, data and addr shifted out (in that
+        -- order), then an update pulse.
+        procedure send_command(
+            addr : in std_ulogic_vector(7 downto 0);
+            data : in std_ulogic_vector(63 downto 0);
+            op   : in std_ulogic_vector(1 downto 0)) is
+        begin
+            -- Capture pulse: one TCK period high, one low
+            j.capture <= '1';
+            clock(1);
+            j.capture <= '0';
+            clock(1);
+            -- Shift phase: op first, so it lands in the lowest bits
+            j.shift <= '1';
+            shift_out(op);
+            shift_out(data);
+            shift_out(addr);
+            j.shift <= '0';
+            -- Update pulse: one TCK period high, one low
+            j.update <= '1';
+            clock(1);
+            j.update <= '0';
+            clock(1);
+        end procedure send_command;
+
+        -- Read back one response: a capture pulse, then op, data and
+        -- the address field shifted in (in that order), then an update
+        -- pulse. The address field is shifted in but not returned.
+        procedure read_resp(
+            op   : out std_ulogic_vector(1 downto 0);
+            data : out std_ulogic_vector(63 downto 0)) is
+
+            variable discarded_addr : std_ulogic_vector(7 downto 0);
+        begin
+            -- Capture pulse
+            j.capture <= '1';
+            clock(1);
+            j.capture <= '0';
+            clock(1);
+
+            -- Shift phase
+            j.shift <= '1';
+            shift_in(op);
+            shift_in(data);
+            shift_in(discarded_addr);
+            j.shift <= '0';
+
+            -- Update pulse
+            j.update <= '1';
+            clock(1);
+            j.update <= '0';
+            clock(1);
+        end procedure read_resp;
+
+        -- Issue a DMI write of data to addr (op "10"), then poll the
+        -- response until the status is OK ("00").
+        --
+        -- A busy status ("11") is counted; once MAX_BUSY_POLLS busy
+        -- responses have been seen an error is reported on each further
+        -- busy response (polling continues, as before). Any other
+        -- status is reported as an error and polling continues.
+        procedure dmi_write(addr : in std_ulogic_vector(7 downto 0);
+                            data : in std_ulogic_vector(63 downto 0)) is
+            -- Number of busy responses tolerated before complaining
+            constant MAX_BUSY_POLLS : integer := 1000;
+            variable resp_op   : std_ulogic_vector(1 downto 0);
+            variable resp_data : std_ulogic_vector(63 downto 0);
+            -- Must start at 0: an uninitialised integer starts at
+            -- integer'low, which together with the old "timeout < 0"
+            -- check meant the timeout could never be reported.
+            variable timeout   : integer := 0;
+        begin
+            send_command(addr, data, "10");
+            loop
+                read_resp(resp_op, resp_data);
+                case resp_op is
+                when "00" =>
+                    return;
+                when "11" =>
+                    timeout := timeout + 1;
+                    assert timeout < MAX_BUSY_POLLS
+                        report "dmi_write timed out !" severity error;
+                when others =>
+                    report "dmi_write got odd status: " &
+                        to_hstring(resp_op) severity error;
+                end case;
+            end loop;
+        end procedure dmi_write;
+        
+
+        -- Issue a DMI read of addr (op "01"), then poll the response
+        -- until the status is OK ("00"); data holds the value shifted
+        -- in with the last response.
+        --
+        -- A busy status ("11") is counted; once MAX_BUSY_POLLS busy
+        -- responses have been seen an error is reported on each further
+        -- busy response (polling continues, as before). Any other
+        -- status is reported as an error and polling continues.
+        procedure dmi_read(addr : in std_ulogic_vector(7 downto 0);
+                           data : out std_ulogic_vector(63 downto 0)) is
+            -- Number of busy responses tolerated before complaining
+            constant MAX_BUSY_POLLS : integer := 1000;
+            variable resp_op   : std_ulogic_vector(1 downto 0);
+            -- Must start at 0: an uninitialised integer starts at
+            -- integer'low, which together with the old "timeout < 0"
+            -- check meant the timeout could never be reported.
+            variable timeout   : integer := 0;
+        begin
+            send_command(addr, (others => '0'), "01");
+            loop
+                read_resp(resp_op, data);
+                case resp_op is
+                when "00" =>
+                    return;
+                when "11" =>
+                    timeout := timeout + 1;
+                    assert timeout < MAX_BUSY_POLLS
+                        report "dmi_read timed out !" severity error;
+                when others =>
+                    report "dmi_read got odd status: " &
+                        to_hstring(resp_op) severity error;
+                end case;
+            end loop;
+        end procedure dmi_read;
+
+        variable data : std_ulogic_vector(63 downto 0);
    begin
-	-- init & reset
-	j.reset <= '1';
-	j.sel <= "0000";
-	j.capture <= '0';
-	j.update <= '0';
-	j.shift <= '0';
-	j.tdi <= '0';
-	j.tms <= '0';
-	j.runtest <= '0';
-	clock(5);
-	j.reset <= '0';
-	clock(5);
-
-	-- select chain 2
-	j.sel <= "0010";
-	clock(1);
-
-	-- send command
-	dmi_read(x"00", data);
-	report "Read addr reg:" & to_hstring(data);
-	report "Writing addr reg to all 1's";
-	dmi_write(x"00", (others => '1'));
-	dmi_read(x"00", data);
-	report "Read addr reg:" & to_hstring(data);
-
-	report "Writing ctrl reg to all 1's";
-	dmi_write(x"02", (others => '1'));
-	dmi_read(x"02", data);
-	report "Read ctrl reg:" & to_hstring(data);
-
-	report "Read memory at 0...\n";
-	dmi_write(x"00", x"0000000000000000");
-	dmi_write(x"02", x"00000000000007ff");
-	dmi_read(x"01", data);
-	report "00:" & to_hstring(data);
-	dmi_read(x"01", data);
-	report "08:" & to_hstring(data);
-	dmi_read(x"01", data);
-	report "10:" & to_hstring(data);
-	dmi_read(x"01", data);
-	report "18:" & to_hstring(data);
-	clock(10);
-	std.env.finish;
+        -- init & reset: all JTAG stimulus signals idle, reset held
+        -- for 5 TCK periods and then released for another 5
+        j.reset <= '1';
+        j.sel <= "0000";
+        j.capture <= '0';
+        j.update <= '0';
+        j.shift <= '0';
+        j.tdi <= '0';
+        j.tms <= '0';
+        j.runtest <= '0';
+        clock(5);
+        j.reset <= '0';
+        clock(5);
+
+        -- select chain 2
+        j.sel <= "0010";
+        clock(1);
+
+        -- send command: read, overwrite and read back DMI register 0
+        dmi_read(x"00", data);
+        report "Read addr reg:" & to_hstring(data);
+        report "Writing addr reg to all 1's";
+        dmi_write(x"00", (others => '1'));
+        dmi_read(x"00", data);
+        report "Read addr reg:" & to_hstring(data);
+
+        -- same for DMI register 2
+        report "Writing ctrl reg to all 1's";
+        dmi_write(x"02", (others => '1'));
+        dmi_read(x"02", data);
+        report "Read ctrl reg:" & to_hstring(data);
+
+        -- VHDL string literals have no escape sequences, so a trailing
+        -- "\n" would be printed literally; report ends the line itself.
+        report "Read memory at 0...";
+        dmi_write(x"00", x"0000000000000000");
+        dmi_write(x"02", x"00000000000007ff");
+        -- Four consecutive reads of DMI register 1
+        -- NOTE(review): presumably the ctrl value written above enables
+        -- address auto-increment, hence the 00/08/10/18 labels - confirm
+        -- against wishbone_debug_master.
+        dmi_read(x"01", data);
+        report "00:" & to_hstring(data);
+        dmi_read(x"01", data);
+        report "08:" & to_hstring(data);
+        dmi_read(x"01", data);
+        report "10:" & to_hstring(data);
+        dmi_read(x"01", data);
+        report "18:" & to_hstring(data);
+        -- Let a few more TCK periods elapse, then end the simulation
+        clock(10);
+        std.env.finish;
    end process;
 end behave;
--- a/dmi_dtm_xilinx.vhdl
+++ b/dmi_dtm_xilinx.vhdl
@ -66,59 +66,59 @@ use unisim.vcomponents.all;

 entity dmi_dtm is
    generic(ABITS : INTEGER:=8;
-	    DBITS : INTEGER:=32);
-
-    port(sys_clk	: in std_ulogic;
-	 sys_reset	: in std_ulogic;
-	 dmi_addr	: out std_ulogic_vector(ABITS - 1 downto 0);
-	 dmi_din	: in std_ulogic_vector(DBITS - 1 downto 0);
-	 dmi_dout	: out std_ulogic_vector(DBITS - 1 downto 0);
-	 dmi_req	: out std_ulogic;
-	 dmi_wr		: out std_ulogic;
-	 dmi_ack	: in std_ulogic
--	 dmi_err	: in std_ulogic TODO: Add error response
-	 );
+            DBITS : INTEGER:=32);
+
+    port(sys_clk   : in std_ulogic;
+         sys_reset : in std_ulogic;
+         dmi_addr  : out std_ulogic_vector(ABITS - 1 downto 0);
+         dmi_din   : in std_ulogic_vector(DBITS - 1 downto 0);
+         dmi_dout  : out std_ulogic_vector(DBITS - 1 downto 0);
+         dmi_req   : out std_ulogic;
+         dmi_wr    : out std_ulogic;
+         dmi_ack   : in std_ulogic
+--         dmi_err : in std_ulogic TODO: Add error response
+         );
 end entity dmi_dtm;

 architecture behaviour of dmi_dtm is

    -- Signals coming out of the BSCANE2 block
-    signal jtag_reset		: std_ulogic;
-    signal capture		: std_ulogic;
-    signal update		: std_ulogic;
-    signal drck			: std_ulogic;
-    signal jtag_clk		: std_ulogic;
-    signal sel			: std_ulogic;
-    signal shift		: std_ulogic;
-    signal tdi			: std_ulogic;
-    signal tdo			: std_ulogic;
-    signal tck			: std_ulogic;
+    signal jtag_reset : std_ulogic;
+    signal capture    : std_ulogic;
+    signal update     : std_ulogic;
+    signal drck       : std_ulogic;
+    signal jtag_clk   : std_ulogic;
+    signal sel        : std_ulogic;
+    signal shift      : std_ulogic;
+    signal tdi        : std_ulogic;
+    signal tdo        : std_ulogic;
+    signal tck        : std_ulogic;

    -- ** JTAG clock domain **

    -- Shift register
-    signal shiftr	: std_ulogic_vector(ABITS + DBITS + 1 downto 0);
+    signal shiftr : std_ulogic_vector(ABITS + DBITS + 1 downto 0);

    -- Latched request
-    signal request	: std_ulogic_vector(ABITS + DBITS + 1 downto 0);
+    signal request : std_ulogic_vector(ABITS + DBITS + 1 downto 0);

    -- A request is present
-    signal jtag_req	: std_ulogic;
+    signal jtag_req : std_ulogic;

    -- Synchronizer for jtag_rsp (sys clk -> jtag_clk)
-    signal dmi_ack_0	: std_ulogic;
-    signal dmi_ack_1	: std_ulogic;
+    signal dmi_ack_0 : std_ulogic;
+    signal dmi_ack_1 : std_ulogic;

    -- ** sys clock domain **

    -- Synchronizer for jtag_req (jtag clk -> sys clk)
-    signal jtag_req_0	: std_ulogic;
-    signal jtag_req_1	: std_ulogic;
+    signal jtag_req_0 : std_ulogic;
+    signal jtag_req_1 : std_ulogic;

    -- ** combination signals
-    signal jtag_bsy	: std_ulogic;
-    signal op_valid	: std_ulogic;
-    signal rsp_op	: std_ulogic_vector(1 downto 0);
+    signal jtag_bsy : std_ulogic;
+    signal op_valid : std_ulogic;
+    signal rsp_op   : std_ulogic_vector(1 downto 0);

    -- ** Constants **
    constant DMI_REQ_NOP : std_ulogic_vector(1 downto 0) := "00";
@ -137,22 +137,22 @@ begin
    -- Implement the Xilinx bscan2 for series 7 devices (TODO: use PoC to
    -- wrap this if compatibility is required with older devices).
    bscan : BSCANE2
-	generic map (
-	    JTAG_CHAIN		=> 2
-	    )
-	port map (
-	    CAPTURE		=> capture,
-	    DRCK		=> drck,
-	    RESET		=> jtag_reset,
-	    RUNTEST		=> open,
-	    SEL			=> sel,
-	    SHIFT		=> shift,
-	    TCK			=> tck,
-	    TDI			=> tdi,
-	    TMS			=> open,
-	    UPDATE		=> update,
-	    TDO			=> tdo
-	    );
+        generic map (
+            JTAG_CHAIN                => 2
+            )
+        port map (
+            CAPTURE => capture,
+            DRCK    => drck,
+            RESET   => jtag_reset,
+            RUNTEST => open,
+            SEL     => sel,
+            SHIFT   => shift,
+            TCK     => tck,
+            TDI     => tdi,
+            TMS     => open,
+            UPDATE  => update,
+            TDO     => tdo
+            );

    -- Some examples out there suggest buffering the clock so it's
    -- treated as a proper clock net. This is probably needed when using
@ -160,39 +160,39 @@ begin
    -- missing the update phase so maybe not...
    --
    clkbuf : BUFG
-	port map (
--	    I => drck,
-	    I => tck,
-	    O => jtag_clk
-	    );
+        port map (
+--            I => drck,
+            I => tck,
+            O => jtag_clk
+            );

    -- dmi_req synchronization
    dmi_req_sync : process(sys_clk)
    begin
-	-- sys_reset is synchronous
-	if rising_edge(sys_clk) then
-	    if (sys_reset = '1') then
-		jtag_req_0 <= '0';
-		jtag_req_1 <= '0';
-	    else
-		jtag_req_0 <= jtag_req;
-		jtag_req_1 <= jtag_req_0;
-	    end if;
-	end if;
+        -- sys_reset is synchronous
+        if rising_edge(sys_clk) then
+            if (sys_reset = '1') then
+                jtag_req_0 <= '0';
+                jtag_req_1 <= '0';
+            else
+                jtag_req_0 <= jtag_req;
+                jtag_req_1 <= jtag_req_0;
+            end if;
+        end if;
    end process;
    dmi_req <= jtag_req_1;

    -- dmi_ack synchronization
    dmi_ack_sync: process(jtag_clk, jtag_reset)
    begin
-	-- jtag_reset is async (see comments)
-	if jtag_reset = '1' then
-	    dmi_ack_0 <= '0';
-	    dmi_ack_1 <= '0';
-	elsif rising_edge(jtag_clk) then
-	    dmi_ack_0 <= dmi_ack;
-	    dmi_ack_1 <= dmi_ack_0;
-	end if;
+        -- jtag_reset is async (see comments)
+        if jtag_reset = '1' then
+            dmi_ack_0 <= '0';
+            dmi_ack_1 <= '0';
+        elsif rising_edge(jtag_clk) then
+            dmi_ack_0 <= dmi_ack;
+            dmi_ack_1 <= dmi_ack_0;
+        end if;
    end process;
   
    -- jtag_bsy indicates whether we can start a new request, we can when
@ -203,9 +203,9 @@ begin

    -- decode request type in shift register
    with shiftr(1 downto 0) select op_valid <=
-	'1' when DMI_REQ_RD,
-	'1' when DMI_REQ_WR,
-	'0' when others;
+        '1' when DMI_REQ_RD,
+        '1' when DMI_REQ_WR,
+        '0' when others;

    -- encode response op
    rsp_op <= DMI_RSP_BSY when jtag_bsy = '1' else DMI_RSP_OK;
@ -222,58 +222,59 @@ begin
    -- jtag_req latch. Could be split into 3 processes but it's probably
    -- not worthwhile.
    --
-    shifter: process(jtag_clk, jtag_reset)
+    shifter: process(jtag_clk, jtag_reset, sys_reset)
    begin
-	if jtag_reset = '1' then
-	    shiftr <= (others => '0');
-	    jtag_req <= '0';
-	elsif rising_edge(jtag_clk) then
-
-	    -- Handle jtag "commands" when sel is 1
-	    if sel = '1' then
-		-- Shift state, rotate the register
-		if shift = '1' then
-		    shiftr <= tdi & shiftr(ABITS + DBITS + 1 downto 1);
-		end if;
-
-		-- Update state (trigger)
-		--
-		-- Latch the request if we aren't already processing one and
-		-- it has a valid command opcode.
-		--
-	    	if update = '1' and op_valid = '1' then
-		    if jtag_bsy = '0' then
-			request <= shiftr;
-			jtag_req <= '1';
-		    end if;
-		    -- Set the shift register "op" to "busy". This will prevent
-		    -- us from re-starting the command on the next update if
-		    -- the command completes before that.
-		    shiftr(1 downto 0) <= DMI_RSP_BSY;
-		end if;
-
-		-- Request completion.
-		--
-		-- Capture the response data for reads and clear request flag.
-		--
-		-- Note: We clear req (and thus dmi_req) here which relies on tck
-		-- ticking and sel set. This means we are stuck with dmi_req up if
-		-- the jtag interface stops. Slaves must be resilient to this.
-		--
-		if jtag_req = '1' and dmi_ack_1 = '1' then
-		    jtag_req <= '0';
-		    if request(1 downto 0) = DMI_REQ_RD then
-			request(DBITS + 1 downto 2) <= dmi_din;
-		    end if;
-		end if;
-
-		-- Capture state, grab latch content with updated status
-		if capture = '1' then
-		    shiftr <= request(ABITS + DBITS + 1 downto 2) & rsp_op;
-		end if;
-
-	    end if;
-	end if;
+        if jtag_reset = '1' or sys_reset = '1' then
+            shiftr <= (others => '0');
+            jtag_req <= '0';
+            request <= (others => '0');
+        elsif rising_edge(jtag_clk) then
+
+            -- Handle jtag "commands" when sel is 1
+            if sel = '1' then
+                -- Shift state, rotate the register
+                if shift = '1' then
+                    shiftr <= tdi & shiftr(ABITS + DBITS + 1 downto 1);
+                end if;
+
+                -- Update state (trigger)
+                --
+                -- Latch the request if we aren't already processing one and
+                -- it has a valid command opcode.
+                --
+                    if update = '1' and op_valid = '1' then
+                    if jtag_bsy = '0' then
+                        request <= shiftr;
+                        jtag_req <= '1';
+                    end if;
+                    -- Set the shift register "op" to "busy". This will prevent
+                    -- us from re-starting the command on the next update if
+                    -- the command completes before that.
+                    shiftr(1 downto 0) <= DMI_RSP_BSY;
+                end if;
+
+                -- Request completion.
+                --
+                -- Capture the response data for reads and clear request flag.
+                --
+                -- Note: We clear req (and thus dmi_req) here which relies on tck
+                -- ticking and sel set. This means we are stuck with dmi_req up if
+                -- the jtag interface stops. Slaves must be resilient to this.
+                --
+                if jtag_req = '1' and dmi_ack_1 = '1' then
+                    jtag_req <= '0';
+                    if request(1 downto 0) = DMI_REQ_RD then
+                        request(DBITS + 1 downto 2) <= dmi_din;
+                    end if;
+                end if;
+
+                -- Capture state, grab latch content with updated status
+                if capture = '1' then
+                    shiftr <= request(ABITS + DBITS + 1 downto 2) & rsp_op;
+                end if;
+
+            end if;
+        end if;
    end process;
 end architecture behaviour;

--- a/dram_tb.vhdl
+++ b/dram_tb.vhdl
@ -44,6 +44,7 @@ begin
            DRAM_ABITS => 24,
            DRAM_ALINES => 1,
            DRAM_DLINES => 16,
+            DRAM_CKLINES => 1,
            DRAM_PORT_WIDTH => 128,
            PAYLOAD_FILE => DRAM_INIT_FILE,
            PAYLOAD_SIZE => DRAM_INIT_SIZE
@ -104,10 +105,10 @@ begin

    -- Read data receive queue
    data_queue: entity work.sync_fifo
-	generic map (
-	    DEPTH => 16,
-	    WIDTH => rd_data'length
-	    )
+        generic map (
+            DEPTH => 16,
+            WIDTH => rd_data'length
+            )
        port map (
            clk      => clk,
            reset    => soc_rst or reset_acks,
@ -250,10 +251,10 @@ begin
        report "Back to back 4 stores 4 reads on hit...";
        clr_acks;
        for i in 0 to 3 loop
-            wb_write(add_off(a, i*8), make_pattern(i), x"ff");
+            wb_write(add_off(a, i), make_pattern(i), x"ff");
        end loop;
        for i in 0 to 3 loop
-            wb_read(add_off(a, i*8));
+            wb_read(add_off(a, i));
        end loop;
        wait_acks(8);
        for i in 0 to 7 loop
@ -268,10 +269,10 @@ begin
        a(10) := '1';
        clr_acks;
        for i in 0 to 3 loop
-            wb_write(add_off(a, i*8), make_pattern(i), x"ff");
+            wb_write(add_off(a, i), make_pattern(i), x"ff");
        end loop;
        for i in 0 to 3 loop
-            wb_read(add_off(a, i*8));
+            wb_read(add_off(a, i));
        end loop;
        wait_acks(8);
        for i in 0 to 7 loop
@ -286,8 +287,8 @@ begin
        a(10) := '1';
        clr_acks;
        for i in 0 to 3 loop
-            wb_write(add_off(a, i*8), make_pattern(i), x"ff");
-            wb_read(add_off(a, i*8));
+            wb_write(add_off(a, i), make_pattern(i), x"ff");
+            wb_read(add_off(a, i));
        end loop;
        wait_acks(8);
        for i in 0 to 3 loop
@ -299,29 +300,29 @@ begin
        a(11) := '1';
        clr_acks;
        wb_write(add_off(a,  0), x"1111111100000000", x"ff");
-        wb_write(add_off(a,  8), x"3333333322222222", x"ff");
-        wb_write(add_off(a, 16), x"5555555544444444", x"ff");
-        wb_write(add_off(a, 24), x"7777777766666666", x"ff");
-        wb_write(add_off(a, 32), x"9999999988888888", x"ff");
-        wb_write(add_off(a, 40), x"bbbbbbbbaaaaaaaa", x"ff");
-        wb_write(add_off(a, 48), x"ddddddddcccccccc", x"ff");
-        wb_write(add_off(a, 56), x"ffffffffeeeeeeee", x"ff");
-        wb_write(add_off(a, 64), x"1111111100000000", x"ff");
-        wb_write(add_off(a, 72), x"3333333322222222", x"ff");
-        wb_write(add_off(a, 80), x"5555555544444444", x"ff");
-        wb_write(add_off(a, 88), x"7777777766666666", x"ff");
-        wb_write(add_off(a, 96), x"9999999988888888", x"ff");
-        wb_write(add_off(a,104), x"bbbbbbbbaaaaaaaa", x"ff");
-        wb_write(add_off(a,112), x"ddddddddcccccccc", x"ff");
-        wb_write(add_off(a,120), x"ffffffffeeeeeeee", x"ff");
+        wb_write(add_off(a,  1), x"3333333322222222", x"ff");
+        wb_write(add_off(a,  2), x"5555555544444444", x"ff");
+        wb_write(add_off(a,  3), x"7777777766666666", x"ff");
+        wb_write(add_off(a,  4), x"9999999988888888", x"ff");
+        wb_write(add_off(a,  5), x"bbbbbbbbaaaaaaaa", x"ff");
+        wb_write(add_off(a,  6), x"ddddddddcccccccc", x"ff");
+        wb_write(add_off(a,  7), x"ffffffffeeeeeeee", x"ff");
+        wb_write(add_off(a,  8), x"1111111100000000", x"ff");
+        wb_write(add_off(a,  9), x"3333333322222222", x"ff");
+        wb_write(add_off(a, 10), x"5555555544444444", x"ff");
+        wb_write(add_off(a, 11), x"7777777766666666", x"ff");
+        wb_write(add_off(a, 12), x"9999999988888888", x"ff");
+        wb_write(add_off(a, 13), x"bbbbbbbbaaaaaaaa", x"ff");
+        wb_write(add_off(a, 14), x"ddddddddcccccccc", x"ff");
+        wb_write(add_off(a, 15), x"ffffffffeeeeeeee", x"ff");
        wait_acks(16);

        report "Scattered from middle of line...";
        clr_acks;
-        wb_read(add_off(a,24));
-        wb_read(add_off(a,32));
+        wb_read(add_off(a, 3));
+        wb_read(add_off(a, 4));
        wb_read(add_off(a, 0));
-        wb_read(add_off(a,16));
+        wb_read(add_off(a, 2));
        wait_acks(4);
        read_data(d);
        assert d = x"7777777766666666" report "bad data (24), got " & to_hstring(d) severity failure;
--- a/execute1.vhdl
+++ b/execute1.vhdl
--- a/fetch1.vhdl
+++ b/fetch1.vhdl
@ -8,7 +8,8 @@ use work.common.all;
 entity fetch1 is
    generic(
 	RESET_ADDRESS     : std_logic_vector(63 downto 0) := (others => '0');
-	ALT_RESET_ADDRESS : std_logic_vector(63 downto 0) := (others => '0')
+	ALT_RESET_ADDRESS : std_logic_vector(63 downto 0) := (others => '0');
+        HAS_BTC           : boolean := true
 	);
    port(
 	clk           : in std_ulogic;
@ -17,11 +18,12 @@ entity fetch1 is
 	-- Control inputs:
 	stall_in      : in std_ulogic;
 	flush_in      : in std_ulogic;
+        inval_btc     : in std_ulogic;
 	stop_in       : in std_ulogic;
 	alt_reset_in  : in std_ulogic;

-	-- redirect from execution unit
-	e_in          : in Execute1ToFetch1Type;
+	-- redirect from writeback unit
+	w_in          : in WritebackToFetch1Type;

        -- redirect from decode1
        d_in          : in Decode1ToFetch1Type;
@ -35,14 +37,28 @@ entity fetch1 is
 end entity fetch1;

 architecture behaviour of fetch1 is
-    type stop_state_t is (RUNNING, STOPPED, RESTARTING);
    type reg_internal_t is record
-	stop_state: stop_state_t;
        mode_32bit: std_ulogic;
+        rd_is_niap4: std_ulogic;
+        predicted_taken: std_ulogic;
+        pred_not_taken: std_ulogic;
+        predicted_nia: std_ulogic_vector(63 downto 0);
    end record;
    signal r, r_next : Fetch1ToIcacheType;
    signal r_int, r_next_int : reg_internal_t;
+    signal advance_nia : std_ulogic;
    signal log_nia : std_ulogic_vector(42 downto 0);
+
+    constant BTC_ADDR_BITS : integer := 10;
+    constant BTC_TAG_BITS : integer := 62 - BTC_ADDR_BITS;
+    constant BTC_TARGET_BITS : integer := 62;
+    constant BTC_SIZE : integer := 2 ** BTC_ADDR_BITS;
+    constant BTC_WIDTH : integer := BTC_TAG_BITS + BTC_TARGET_BITS + 1;
+    type btc_mem_type is array (0 to BTC_SIZE - 1) of std_ulogic_vector(BTC_WIDTH - 1 downto 0);
+
+    signal btc_rd_data : std_ulogic_vector(BTC_WIDTH - 1 downto 0) := (others => '0');
+    signal btc_rd_valid : std_ulogic := '0';
+
 begin

    regs : process(clk)
@ -55,26 +71,84 @@ begin
                    " P:" & std_ulogic'image(r_next.priv_mode) &
                    " E:" & std_ulogic'image(r_next.big_endian) &
                    " 32:" & std_ulogic'image(r_next_int.mode_32bit) &
-		    " R:" & std_ulogic'image(e_in.redirect) & std_ulogic'image(d_in.redirect) &
+		    " R:" & std_ulogic'image(w_in.redirect) & std_ulogic'image(d_in.redirect) &
 		    " S:" & std_ulogic'image(stall_in) &
 		    " T:" & std_ulogic'image(stop_in) &
-		    " nia:" & to_hstring(r_next.nia) &
-		    " SM:" & std_ulogic'image(r_next.stop_mark);
+		    " nia:" & to_hstring(r_next.nia);
 	    end if;
-	    r <= r_next;
-	    r_int <= r_next_int;
+            if rst = '1' or w_in.redirect = '1' or d_in.redirect = '1' or stall_in = '0' then
+                r.virt_mode <= r_next.virt_mode;
+                r.priv_mode <= r_next.priv_mode;
+                r.big_endian <= r_next.big_endian;
+                r_int.mode_32bit <= r_next_int.mode_32bit;
+            end if;
+            if advance_nia = '1' then
+                r.predicted <= r_next.predicted;
+                r.pred_ntaken <= r_next.pred_ntaken;
+                r.nia <= r_next.nia;
+                r_int.predicted_taken <= r_next_int.predicted_taken;
+                r_int.pred_not_taken <= r_next_int.pred_not_taken;
+                r_int.predicted_nia <= r_next_int.predicted_nia;
+                r_int.rd_is_niap4 <= r_next_int.rd_is_niap4;
+            end if;
+            -- always send the up-to-date stop mark and req
+            r.stop_mark <= stop_in;
+            r.req <= not rst;
 	end if;
    end process;
    log_out <= log_nia;

+    btc : if HAS_BTC generate
+        signal btc_memory : btc_mem_type;
+        attribute ram_style : string;
+        attribute ram_style of btc_memory : signal is "block";
+
+        signal btc_valids : std_ulogic_vector(BTC_SIZE - 1 downto 0);
+        attribute ram_style of btc_valids : signal is "distributed";
+
+        signal btc_wr : std_ulogic;
+        signal btc_wr_data : std_ulogic_vector(BTC_WIDTH - 1 downto 0);
+        signal btc_wr_addr : std_ulogic_vector(BTC_ADDR_BITS - 1 downto 0);
+    begin
+        btc_wr_data <= w_in.br_taken &
+                       w_in.br_nia(63 downto BTC_ADDR_BITS + 2) &
+                       w_in.redirect_nia(63 downto 2);
+        btc_wr_addr <= w_in.br_nia(BTC_ADDR_BITS + 1 downto 2);
+        btc_wr <= w_in.br_last;
+
+        btc_ram : process(clk)
+            variable raddr : unsigned(BTC_ADDR_BITS - 1 downto 0);
+        begin
+            if rising_edge(clk) then
+                raddr := unsigned(r.nia(BTC_ADDR_BITS + 1 downto 2)) +
+                         to_unsigned(2, BTC_ADDR_BITS);
+                if advance_nia = '1' then
+                    btc_rd_data <= btc_memory(to_integer(raddr));
+                    btc_rd_valid <= btc_valids(to_integer(raddr));
+                end if;
+                if btc_wr = '1' then
+                    btc_memory(to_integer(unsigned(btc_wr_addr))) <= btc_wr_data;
+                end if;
+                if inval_btc = '1' or rst = '1' then
+                    btc_valids <= (others => '0');
+                elsif btc_wr = '1' then
+                    btc_valids(to_integer(unsigned(btc_wr_addr))) <= '1';
+                end if;
+            end if;
+        end process;
+    end generate;
+
    comb : process(all)
 	variable v : Fetch1ToIcacheType;
 	variable v_int : reg_internal_t;
-	variable increment : boolean;
    begin
 	v := r;
 	v_int := r_int;
-        v.sequential := '0';
+        v.predicted := '0';
+        v.pred_ntaken := '0';
+        v_int.predicted_taken := '0';
+        v_int.pred_not_taken := '0';
+        v_int.rd_is_niap4 := '0';

 	if rst = '1' then
 	    if alt_reset_in = '1' then
@ -85,78 +159,44 @@ begin
            v.virt_mode := '0';
            v.priv_mode := '1';
            v.big_endian := '0';
-	    v_int.stop_state := RUNNING;
            v_int.mode_32bit := '0';
-	elsif e_in.redirect = '1' then
-	    v.nia := e_in.redirect_nia(63 downto 2) & "00";
-            if e_in.mode_32bit = '1' then
+            v_int.predicted_nia := (others => '0');
+	elsif w_in.redirect = '1' then
+	    v.nia := w_in.redirect_nia(63 downto 2) & "00";
+            if w_in.mode_32bit = '1' then
                v.nia(63 downto 32) := (others => '0');
            end if;
-            v.virt_mode := e_in.virt_mode;
-            v.priv_mode := e_in.priv_mode;
-            v.big_endian := e_in.big_endian;
-            v_int.mode_32bit := e_in.mode_32bit;
+            v.virt_mode := w_in.virt_mode;
+            v.priv_mode := w_in.priv_mode;
+            v.big_endian := w_in.big_endian;
+            v_int.mode_32bit := w_in.mode_32bit;
        elsif d_in.redirect = '1' then
            v.nia := d_in.redirect_nia(63 downto 2) & "00";
            if r_int.mode_32bit = '1' then
                v.nia(63 downto 32) := (others => '0');
            end if;
-	elsif stall_in = '0' then
-
-	    -- For debug stop/step to work properly we need a little bit of
-	    -- trickery here. If we just stop incrementing and send stop marks
-	    -- when stop_in is set, then we'll increment on the cycle it clears
-	    -- and end up never executing the instruction we were stopped on.
-	    --
-	    -- Avoid this along with the opposite issue when stepping (stop is
-	    -- cleared for only one cycle) is handled by the state machine below
-	    --
-	    -- By default, increment addresses
-	    increment := true;
-	    case v_int.stop_state is
-	    when RUNNING =>
-		-- If we are running and stop_in is set, then stop incrementing,
-		-- we are now stopped.
-		if stop_in = '1' then
-		    increment := false;
-		    v_int.stop_state := STOPPED;
-		end if;
-	    when STOPPED =>
-		-- When stopped, never increment. If stop is cleared, go to state
-		-- "restarting" but still don't increment that cycle. stop_in is
-		-- now 0 so we'll send the NIA down without a stop mark.
-		increment := false;
-		if stop_in = '0' then
-		    v_int.stop_state := RESTARTING;
-		end if;
-	    when RESTARTING =>
-		-- We have just sent the NIA down, we can start incrementing again.
-		-- If stop_in is still not set, go back to running normally.
-		-- If stop_in is set again (that was a one-cycle "step"), go
-		-- back to "stopped" state which means we'll stop incrementing
-		-- on the next cycle. This ensures we increment the PC once after
-		-- sending one instruction without a stop mark. Since stop_in is
-		-- now set, the new PC will be sent with a stop mark and thus not
-		-- executed.
-		if stop_in = '0' then
-		    v_int.stop_state := RUNNING;
-		else
-		    v_int.stop_state := STOPPED;
-		end if;
-	    end case;
-
-	    if increment then
-                if r_int.mode_32bit = '0' then
-                    v.nia := std_ulogic_vector(unsigned(r.nia) + 4);
-                else
-                    v.nia := x"00000000" & std_ulogic_vector(unsigned(r.nia(31 downto 0)) + 4);
-                end if;
-                v.sequential := '1';
-	    end if;
-	end if;
+        elsif r_int.predicted_taken = '1' then
+            v.nia := r_int.predicted_nia;
+            v.predicted := '1';
+        else
+            v_int.rd_is_niap4 := '1';
+            v.pred_ntaken := r_int.pred_not_taken;
+            v.nia := std_ulogic_vector(unsigned(r.nia) + 4);
+            if r_int.mode_32bit = '1' then
+                v.nia(63 downto 32) := x"00000000";
+            end if;
+            if btc_rd_valid = '1' and r_int.rd_is_niap4 = '1' and
+                btc_rd_data(BTC_WIDTH - 2 downto BTC_TARGET_BITS)
+                = v.nia(BTC_TAG_BITS + BTC_ADDR_BITS + 1 downto BTC_ADDR_BITS + 2) then
+                v_int.predicted_taken := btc_rd_data(BTC_WIDTH - 1);
+                v_int.pred_not_taken := not btc_rd_data(BTC_WIDTH - 1);
+            end if;
+        end if;
+        v_int.predicted_nia := btc_rd_data(BTC_TARGET_BITS - 1 downto 0) & "00";

-	v.req := not rst;
-	v.stop_mark := stop_in;
+        -- If the last NIA value went down with a stop mark, it didn't get
+        -- executed, and hence we shouldn't increment NIA.
+        advance_nia <= rst or w_in.redirect or d_in.redirect or (not r.stop_mark and not stall_in);

 	r_next <= v;
 	r_next_int <= v_int;
--- a/foreign_random.vhdl
+++ b/foreign_random.vhdl
@ -0,0 +1,30 @@
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+library work;
+use work.glibc_random.all;
+
+entity random is
+    port (
+        clk  : in std_ulogic;
+        data : out std_ulogic_vector(63 downto 0);
+        raw  : out std_ulogic_vector(63 downto 0);
+        err  : out std_ulogic
+        );
+end entity random;
+
+architecture behaviour of random is
+begin
+    err <= '0';
+
+    process(clk)
+        variable rand : std_ulogic_vector(63 downto 0);
+    begin
+        if rising_edge(clk) then
+            rand := pseudorand(64);
+            data <= rand;
+            raw <= rand;
+        end if;
+    end process;
+end behaviour;
--- a/fpga/arty_a7.xdc
+++ b/fpga/arty_a7.xdc
@ -9,15 +9,6 @@ set_property -dict { PACKAGE_PIN C2  IOSTANDARD LVCMOS33 } [get_ports { ext_rst_
 set_property -dict { PACKAGE_PIN D10 IOSTANDARD LVCMOS33 } [get_ports { uart_main_tx }];
 set_property -dict { PACKAGE_PIN A9  IOSTANDARD LVCMOS33 } [get_ports { uart_main_rx }];

-################################################################################
-# Pmod Header JC: UART (bottom)
-################################################################################
-
-set_property -dict { PACKAGE_PIN U14 IOSTANDARD LVCMOS33 } [get_ports { uart_pmod_cts_n }];
-set_property -dict { PACKAGE_PIN V14 IOSTANDARD LVCMOS33 } [get_ports { uart_pmod_tx }];
-set_property -dict { PACKAGE_PIN T13 IOSTANDARD LVCMOS33 } [get_ports { uart_pmod_rx }];
-set_property -dict { PACKAGE_PIN U13 IOSTANDARD LVCMOS33 } [get_ports { uart_pmod_rts_n }];
-
 ################################################################################
 # RGB LEDs
 ################################################################################
@ -25,6 +16,15 @@ set_property -dict { PACKAGE_PIN U13 IOSTANDARD LVCMOS33 } [get_ports { uart_pmo
 set_property -dict { PACKAGE_PIN E1  IOSTANDARD LVCMOS33 } [get_ports { led0_b }];
 set_property -dict { PACKAGE_PIN F6  IOSTANDARD LVCMOS33 } [get_ports { led0_g }];
 set_property -dict { PACKAGE_PIN G6  IOSTANDARD LVCMOS33 } [get_ports { led0_r }];
+#set_property -dict { PACKAGE_PIN G4  IOSTANDARD LVCMOS33 } [get_ports { led1_b }];
+#set_property -dict { PACKAGE_PIN J4  IOSTANDARD LVCMOS33 } [get_ports { led1_g }];
+#set_property -dict { PACKAGE_PIN G3  IOSTANDARD LVCMOS33 } [get_ports { led1_r }];
+#set_property -dict { PACKAGE_PIN H4  IOSTANDARD LVCMOS33 } [get_ports { led2_b }];
+#set_property -dict { PACKAGE_PIN J2  IOSTANDARD LVCMOS33 } [get_ports { led2_g }];
+#set_property -dict { PACKAGE_PIN J3  IOSTANDARD LVCMOS33 } [get_ports { led2_r }];
+#set_property -dict { PACKAGE_PIN K2  IOSTANDARD LVCMOS33 } [get_ports { led3_b }];
+#set_property -dict { PACKAGE_PIN H6  IOSTANDARD LVCMOS33 } [get_ports { led3_g }];
+#set_property -dict { PACKAGE_PIN K1  IOSTANDARD LVCMOS33 } [get_ports { led3_r }];

 ################################################################################
 # Normal LEDs
@ -50,6 +50,125 @@ set_property -dict { PACKAGE_PIN M14 IOSTANDARD LVCMOS33 } [get_ports { spi_flas
 set_property IOB true [get_cells -hierarchical -filter {NAME =~*/spi_rxtx/*sck_1*}]
 set_property IOB true [get_cells -hierarchical -filter {NAME =~*/spi_rxtx/input_delay_1.dat_i_l*}]

+################################################################################
+# PMOD header JA (standard, 200 ohm protection resistors)
+################################################################################
+
+#set_property -dict { PACKAGE_PIN G13 IOSTANDARD LVCMOS33 } [get_ports { pmod_ja_1 }];
+#set_property -dict { PACKAGE_PIN B11 IOSTANDARD LVCMOS33 } [get_ports { pmod_ja_2 }];
+#set_property -dict { PACKAGE_PIN A11 IOSTANDARD LVCMOS33 } [get_ports { pmod_ja_3 }];
+#set_property -dict { PACKAGE_PIN D12 IOSTANDARD LVCMOS33 } [get_ports { pmod_ja_4 }];
+#set_property -dict { PACKAGE_PIN D13 IOSTANDARD LVCMOS33 } [get_ports { pmod_ja_7 }];
+#set_property -dict { PACKAGE_PIN B18 IOSTANDARD LVCMOS33 } [get_ports { pmod_ja_8 }];
+#set_property -dict { PACKAGE_PIN A18 IOSTANDARD LVCMOS33 } [get_ports { pmod_ja_9 }];
+#set_property -dict { PACKAGE_PIN K16 IOSTANDARD LVCMOS33 } [get_ports { pmod_ja_10 }];
+
+# connection to Digilent PmodSD on JA
+set_property -dict { PACKAGE_PIN G13 IOSTANDARD LVCMOS33 SLEW FAST PULLUP TRUE } [get_ports { sdcard_data[3] }];
+set_property -dict { PACKAGE_PIN B11 IOSTANDARD LVCMOS33 SLEW FAST PULLUP TRUE } [get_ports { sdcard_cmd }];
+set_property -dict { PACKAGE_PIN A11 IOSTANDARD LVCMOS33 SLEW FAST PULLUP TRUE } [get_ports { sdcard_data[0] }];
+set_property -dict { PACKAGE_PIN D12 IOSTANDARD LVCMOS33 SLEW FAST } [get_ports { sdcard_clk }];
+set_property -dict { PACKAGE_PIN D13 IOSTANDARD LVCMOS33 SLEW FAST PULLUP TRUE } [get_ports { sdcard_data[1] }];
+set_property -dict { PACKAGE_PIN B18 IOSTANDARD LVCMOS33 SLEW FAST PULLUP TRUE } [get_ports { sdcard_data[2] }];
+set_property -dict { PACKAGE_PIN A18 IOSTANDARD LVCMOS33 } [get_ports { sdcard_cd }];
+#set_property -dict { PACKAGE_PIN K16 IOSTANDARD LVCMOS33 } [get_ports { sdcard_wp }];
+
+# Put registers into IOBs to improve timing
+set_property IOB true [get_cells -hierarchical -filter {NAME =~*.litesdcard/sdcard_*}]
+
+################################################################################
+# PMOD header JB (high-speed, no protection resistors)
+################################################################################
+
+#set_property -dict { PACKAGE_PIN E15 IOSTANDARD LVCMOS33 } [get_ports { pmod_jb_1 }];
+#set_property -dict { PACKAGE_PIN E16 IOSTANDARD LVCMOS33 } [get_ports { pmod_jb_2 }];
+#set_property -dict { PACKAGE_PIN D15 IOSTANDARD LVCMOS33 } [get_ports { pmod_jb_3 }];
+#set_property -dict { PACKAGE_PIN C15 IOSTANDARD LVCMOS33 } [get_ports { pmod_jb_4 }];
+#set_property -dict { PACKAGE_PIN J17 IOSTANDARD LVCMOS33 } [get_ports { pmod_jb_7 }];
+#set_property -dict { PACKAGE_PIN J18 IOSTANDARD LVCMOS33 } [get_ports { pmod_jb_8 }];
+#set_property -dict { PACKAGE_PIN K15 IOSTANDARD LVCMOS33 } [get_ports { pmod_jb_9 }];
+#set_property -dict { PACKAGE_PIN J15 IOSTANDARD LVCMOS33 } [get_ports { pmod_jb_10 }];
+
+# connection to Digilent PmodSD on JB
+#set_property -dict { PACKAGE_PIN E15 IOSTANDARD LVCMOS33 SLEW FAST PULLUP TRUE } [get_ports { sdcard_data[3] }];
+#set_property -dict { PACKAGE_PIN E16 IOSTANDARD LVCMOS33 SLEW FAST PULLUP TRUE } [get_ports { sdcard_cmd }];
+#set_property -dict { PACKAGE_PIN D15 IOSTANDARD LVCMOS33 SLEW FAST PULLUP TRUE } [get_ports { sdcard_data[0] }];
+#set_property -dict { PACKAGE_PIN C15 IOSTANDARD LVCMOS33 SLEW FAST } [get_ports { sdcard_clk }];
+#set_property -dict { PACKAGE_PIN J17 IOSTANDARD LVCMOS33 SLEW FAST PULLUP TRUE } [get_ports { sdcard_data[1] }];
+#set_property -dict { PACKAGE_PIN J18 IOSTANDARD LVCMOS33 SLEW FAST PULLUP TRUE } [get_ports { sdcard_data[2] }];
+#set_property -dict { PACKAGE_PIN K15 IOSTANDARD LVCMOS33 } [get_ports { sdcard_cd }];
+#set_property -dict { PACKAGE_PIN J15 IOSTANDARD LVCMOS33 } [get_ports { sdcard_wp }];
+
+################################################################################
+# PMOD header JC (high-speed, no protection resistors)
+################################################################################
+
+#set_property -dict { PACKAGE_PIN U12 IOSTANDARD LVCMOS33 } [get_ports { pmod_jc_1 }];
+#set_property -dict { PACKAGE_PIN V12 IOSTANDARD LVCMOS33 } [get_ports { pmod_jc_2 }];
+#set_property -dict { PACKAGE_PIN V10 IOSTANDARD LVCMOS33 } [get_ports { pmod_jc_3 }];
+#set_property -dict { PACKAGE_PIN V11 IOSTANDARD LVCMOS33 } [get_ports { pmod_jc_4 }];
+#set_property -dict { PACKAGE_PIN U14 IOSTANDARD LVCMOS33 } [get_ports { pmod_jc_7 }];
+#set_property -dict { PACKAGE_PIN V14 IOSTANDARD LVCMOS33 } [get_ports { pmod_jc_8 }];
+#set_property -dict { PACKAGE_PIN T13 IOSTANDARD LVCMOS33 } [get_ports { pmod_jc_9 }];
+#set_property -dict { PACKAGE_PIN U13 IOSTANDARD LVCMOS33 } [get_ports { pmod_jc_10 }];
+
+################################################################################
+# PMOD header JD (standard, 200 ohm protection resistors)
+################################################################################
+
+#set_property -dict { PACKAGE_PIN D4 IOSTANDARD LVCMOS33 } [get_ports { pmod_jd_1 }];
+#set_property -dict { PACKAGE_PIN D3 IOSTANDARD LVCMOS33 } [get_ports { pmod_jd_2 }];
+#set_property -dict { PACKAGE_PIN F4 IOSTANDARD LVCMOS33 } [get_ports { pmod_jd_3 }];
+#set_property -dict { PACKAGE_PIN F3 IOSTANDARD LVCMOS33 } [get_ports { pmod_jd_4 }];
+#set_property -dict { PACKAGE_PIN E2 IOSTANDARD LVCMOS33 } [get_ports { pmod_jd_7 }];
+#set_property -dict { PACKAGE_PIN D2 IOSTANDARD LVCMOS33 } [get_ports { pmod_jd_8 }];
+#set_property -dict { PACKAGE_PIN H2 IOSTANDARD LVCMOS33 } [get_ports { pmod_jd_9 }];
+#set_property -dict { PACKAGE_PIN G2 IOSTANDARD LVCMOS33 } [get_ports { pmod_jd_10 }];
+
+################################################################################
+# Arduino/chipKIT shield connector
+################################################################################
+
+set_property -dict { PACKAGE_PIN V15 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io[0] }];
+set_property -dict { PACKAGE_PIN U16 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io[1] }];
+set_property -dict { PACKAGE_PIN P14 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io[2] }];
+set_property -dict { PACKAGE_PIN T11 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io[3] }];
+set_property -dict { PACKAGE_PIN R12 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io[4] }];
+set_property -dict { PACKAGE_PIN T14 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io[5] }];
+set_property -dict { PACKAGE_PIN T15 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io[6] }];
+set_property -dict { PACKAGE_PIN T16 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io[7] }];
+set_property -dict { PACKAGE_PIN N15 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io[8] }];
+set_property -dict { PACKAGE_PIN M16 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io[9] }];
+set_property -dict { PACKAGE_PIN V17 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io[10] }];
+set_property -dict { PACKAGE_PIN U18 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io[11] }];
+set_property -dict { PACKAGE_PIN R17 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io[12] }];
+set_property -dict { PACKAGE_PIN P17 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io[13] }];
+set_property -dict { PACKAGE_PIN U11 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io[26] }];
+set_property -dict { PACKAGE_PIN V16 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io[27] }];
+set_property -dict { PACKAGE_PIN M13 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io[28] }];
+set_property -dict { PACKAGE_PIN R10 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io[29] }];
+set_property -dict { PACKAGE_PIN R11 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io[30] }];
+set_property -dict { PACKAGE_PIN R13 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io[31] }];
+set_property -dict { PACKAGE_PIN R15 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io[32] }];
+set_property -dict { PACKAGE_PIN P15 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io[33] }];
+set_property -dict { PACKAGE_PIN R16 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io[34] }];
+set_property -dict { PACKAGE_PIN N16 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io[35] }];
+set_property -dict { PACKAGE_PIN N14 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io[36] }];
+set_property -dict { PACKAGE_PIN U17 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io[37] }];
+set_property -dict { PACKAGE_PIN T18 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io[38] }];
+set_property -dict { PACKAGE_PIN R18 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io[39] }];
+set_property -dict { PACKAGE_PIN P18 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io[40] }];
+set_property -dict { PACKAGE_PIN N17 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io[41] }];
+set_property -dict { PACKAGE_PIN M17 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io[42] }]; # A
+set_property -dict { PACKAGE_PIN L18 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io[43] }]; # SCL
+set_property -dict { PACKAGE_PIN M18 IOSTANDARD LVCMOS33 PULLDOWN TRUE } [get_ports { shield_io[44] }]; # SDA
+#set_property -dict { PACKAGE_PIN C2  IOSTANDARD LVCMOS33 } [get_ports { shield_rst }];
+
+#set_property -dict { PACKAGE_PIN C1  IOSTANDARD LVCMOS33 } [get_ports { spi_hdr_ss }];
+#set_property -dict { PACKAGE_PIN F1  IOSTANDARD LVCMOS33 } [get_ports { spi_hdr_clk }];
+#set_property -dict { PACKAGE_PIN H1  IOSTANDARD LVCMOS33 } [get_ports { spi_hdr_mosi }];
+#set_property -dict { PACKAGE_PIN G1  IOSTANDARD LVCMOS33 } [get_ports { spi_hdr_miso }];
+
 ################################################################################
 # Ethernet (generated by LiteX)
 ################################################################################
@ -412,7 +531,7 @@ set_property CONFIG_MODE SPIx4 [current_design]
 # Clock constraints
 ################################################################################

-create_clock -add -name sys_clk_pin -period 10.00 -waveform {0 5} [get_ports { ext_clk }];
+create_clock -name sys_clk_pin -period 10.00 [get_ports { ext_clk }];

 create_clock -name eth_rx_clk -period 40.0 [get_ports { eth_clocks_rx }]

--- a/fpga/clk_gen_ecp5.vhd
+++ b/fpga/clk_gen_ecp5.vhd
@ -8,11 +8,11 @@ entity clock_generator is
        );

    port (
-	ext_clk        : in  std_logic;
-	pll_rst_in   : in  std_logic;
-	pll_clk_out : out std_logic;
-	pll_locked_out : out std_logic
-	);
+        ext_clk        : in  std_logic;
+        pll_rst_in   : in  std_logic;
+        pll_clk_out : out std_logic;
+        pll_locked_out : out std_logic
+        );

 end entity clock_generator;

@ -20,108 +20,117 @@ architecture bypass of clock_generator is

    -- prototype of ECP5 PLL
    component EHXPLLL is
-	generic (
-	    CLKI_DIV : integer := 1;
-	    CLKFB_DIV : integer := 1;
-	    CLKOP_DIV : integer := 8;
-	    CLKOS_DIV : integer := 8;
-	    CLKOS2_DIV : integer := 8;
-	    CLKOS3_DIV : integer := 8;
-	    CLKOP_ENABLE : string := "ENABLED";
-	    CLKOS_ENABLE : string := "DISABLED";
-	    CLKOS2_ENABLE : string := "DISABLED";
-	    CLKOS3_ENABLE : string := "DISABLED";
-	    CLKOP_CPHASE : integer := 0;
-	    CLKOS_CPHASE : integer := 0;
-	    CLKOS2_CPHASE : integer := 0;
-	    CLKOS3_CPHASE : integer := 0;
-	    CLKOP_FPHASE : integer := 0;
-	    CLKOS_FPHASE : integer := 0;
-	    CLKOS2_FPHASE : integer := 0;
-	    CLKOS3_FPHASE : integer := 0;
-	    FEEDBK_PATH : string := "CLKOP";
-	    CLKOP_TRIM_POL : string := "RISING";
-	    CLKOP_TRIM_DELAY : integer := 0;
-	    CLKOS_TRIM_POL : string := "RISING";
-	    CLKOS_TRIM_DELAY : integer := 0;
-	    OUTDIVIDER_MUXA : string := "DIVA";
-	    OUTDIVIDER_MUXB : string := "DIVB";
-	    OUTDIVIDER_MUXC : string := "DIVC";
-	    OUTDIVIDER_MUXD : string := "DIVD";
-	    PLL_LOCK_MODE : integer := 0;
-	    PLL_LOCK_DELAY : integer := 200;
-	    STDBY_ENABLE : string := "DISABLED";
-	    REFIN_RESET : string := "DISABLED";
-	    SYNC_ENABLE : string := "DISABLED";
-	    INT_LOCK_STICKY : string := "ENABLED";
-	    DPHASE_SOURCE : string := "DISABLED";
-	    PLLRST_ENA : string := "DISABLED";
-	    INTFB_WAKE : string := "DISABLED"  );
-	port (
-	    CLKI :   in  std_logic;
-	    CLKFB :   in  std_logic;
-	    PHASESEL1 :   in  std_logic;
-	    PHASESEL0 :   in  std_logic;
-	    PHASEDIR :   in  std_logic;
-	    PHASESTEP :   in  std_logic;
-	    PHASELOADREG :   in  std_logic;
-	    STDBY :   in  std_logic;
-	    PLLWAKESYNC :   in  std_logic;
-	    RST :   in  std_logic;
-	    ENCLKOP :   in  std_logic;
-	    ENCLKOS :   in  std_logic;
-	    ENCLKOS2 :   in  std_logic;
-	    ENCLKOS3 :   in  std_logic;
-	    CLKOP :   out  std_logic;
-	    CLKOS :   out  std_logic;
-	    CLKOS2 :   out  std_logic;
-	    CLKOS3 :   out  std_logic;
-	    LOCK :   out  std_logic;
-	    INTLOCK :   out  std_logic;
-	    REFCLK :   out  std_logic;
-	    CLKINTFB :   out  std_logic  );
+        generic (
+            CLKI_DIV : integer := 1;
+            CLKFB_DIV : integer := 1;
+            CLKOP_DIV : integer := 8;
+            CLKOS_DIV : integer := 8;
+            CLKOS2_DIV : integer := 8;
+            CLKOS3_DIV : integer := 8;
+            CLKOP_ENABLE : string := "ENABLED";
+            CLKOS_ENABLE : string := "DISABLED";
+            CLKOS2_ENABLE : string := "DISABLED";
+            CLKOS3_ENABLE : string := "DISABLED";
+            CLKOP_CPHASE : integer := 0;
+            CLKOS_CPHASE : integer := 0;
+            CLKOS2_CPHASE : integer := 0;
+            CLKOS3_CPHASE : integer := 0;
+            CLKOP_FPHASE : integer := 0;
+            CLKOS_FPHASE : integer := 0;
+            CLKOS2_FPHASE : integer := 0;
+            CLKOS3_FPHASE : integer := 0;
+            FEEDBK_PATH : string := "CLKOP";
+            CLKOP_TRIM_POL : string := "RISING";
+            CLKOP_TRIM_DELAY : integer := 0;
+            CLKOS_TRIM_POL : string := "RISING";
+            CLKOS_TRIM_DELAY : integer := 0;
+            OUTDIVIDER_MUXA : string := "DIVA";
+            OUTDIVIDER_MUXB : string := "DIVB";
+            OUTDIVIDER_MUXC : string := "DIVC";
+            OUTDIVIDER_MUXD : string := "DIVD";
+            PLL_LOCK_MODE : integer := 0;
+            PLL_LOCK_DELAY : integer := 200;
+            STDBY_ENABLE : string := "DISABLED";
+            REFIN_RESET : string := "DISABLED";
+            SYNC_ENABLE : string := "DISABLED";
+            INT_LOCK_STICKY : string := "ENABLED";
+            DPHASE_SOURCE : string := "DISABLED";
+            PLLRST_ENA : string := "DISABLED";
+            INTFB_WAKE : string := "DISABLED"  );
+        port (
+            CLKI :   in  std_logic;
+            CLKFB :   in  std_logic;
+            PHASESEL1 :   in  std_logic;
+            PHASESEL0 :   in  std_logic;
+            PHASEDIR :   in  std_logic;
+            PHASESTEP :   in  std_logic;
+            PHASELOADREG :   in  std_logic;
+            STDBY :   in  std_logic;
+            PLLWAKESYNC :   in  std_logic;
+            RST :   in  std_logic;
+            ENCLKOP :   in  std_logic;
+            ENCLKOS :   in  std_logic;
+            ENCLKOS2 :   in  std_logic;
+            ENCLKOS3 :   in  std_logic;
+            CLKOP :   out  std_logic;
+            CLKOS :   out  std_logic;
+            CLKOS2 :   out  std_logic;
+            CLKOS3 :   out  std_logic;
+            LOCK :   out  std_logic;
+            INTLOCK :   out  std_logic;
+            REFCLK :   out  std_logic;
+            CLKINTFB :   out  std_logic  );
    end component;

+    signal clkos : std_ulogic;
    signal clkop : std_logic;
    signal lock : std_logic;

-    -- PLL constants based on prjtrellis example
-    constant PLL_IN : natural :=    2000000;
-    constant PLL_OUT : natural := 600000000;
+    -- PLL constants: PLL_IN is the frequency after the CLKI divider and
+    -- PLL_OUT the frequency ahead of the CLKOP/CLKOS output dividers.
+    -- According to the datasheet, PLL_IN needs to be between 10 and 400 MHz
+    -- and PLL_OUT between 400 and 800 MHz. PLL_IN is chosen based on 12 and
+    -- 48 MHz being common values for the reference clock.
+    constant PLL_IN : natural :=   12000000;
+    constant PLL_OUT : natural := 480000000;

    -- Configration for ECP5 PLL
    constant PLL_CLKOP_DIV : natural := PLL_OUT/CLK_OUTPUT_HZ;
-    constant PLL_CLKFB_DIV : natural := CLK_OUTPUT_HZ/PLL_IN;
+    constant PLL_CLKOS_DIV : natural := 2;
+    constant PLL_CLKFB_DIV : natural := PLL_OUT/PLL_CLKOS_DIV/PLL_IN;
    constant PLL_CLKI_DIV  : natural := CLK_INPUT_HZ/PLL_IN;

 begin
    pll_clk_out <= clkop;
-    pll_locked_out <= not lock; -- FIXME: EHXPLLL lock signal active low?!?
+    pll_locked_out <= lock;

    clkgen: EHXPLLL
-	generic map(
-	    CLKOP_CPHASE => 11, -- FIXME: Copied from prjtrells.
+        generic map(
            CLKOP_DIV => PLL_CLKOP_DIV,
-	    CLKFB_DIV => PLL_CLKFB_DIV,
-	    CLKI_DIV  => PLL_CLKI_DIV
-	)
-	port map (
-	    CLKI => ext_clk,
-	    CLKOP => clkop,
-	    CLKFB => clkop,
-	    LOCK => lock,
-	    RST => pll_rst_in,
-	    PHASESEL1 => '0',
-	    PHASESEL0 => '0',
-	    PHASEDIR => '0',
-	    PHASESTEP => '0',
-	    PHASELOADREG => '0',
-	    STDBY => '0',
-	    PLLWAKESYNC => '0',
-	    ENCLKOP => '0',
-	    ENCLKOS => '0',
-	    ENCLKOS2 => '0',
-	    ENCLKOS3 => '0'
+            CLKOS_ENABLE => "ENABLED",
+            CLKOS_DIV => PLL_CLKOS_DIV,
+            CLKFB_DIV => PLL_CLKFB_DIV,
+            CLKI_DIV  => PLL_CLKI_DIV,
+            FEEDBK_PATH => "CLKOS"
+        )
+        port map (
+            CLKI => ext_clk,
+            CLKOP => clkop,
+            CLKOS => clkos,
+            CLKFB => clkos,
+            LOCK => lock,
+            RST => pll_rst_in,
+            PHASESEL1 => '0',
+            PHASESEL0 => '0',
+            PHASEDIR => '0',
+            PHASESTEP => '0',
+            PHASELOADREG => '0',
+            STDBY => '0',
+            PLLWAKESYNC => '0',
+            ENCLKOP => '1',
+            ENCLKOS => '1',
+            ENCLKOS2 => '0',
+            ENCLKOS3 => '0'
    );

 end architecture bypass;
--- a/fpga/clk_gen_mcmm.vhd
+++ b/fpga/clk_gen_mcmm.vhd
@ -8,7 +8,7 @@ entity clock_generator is
    generic (
        CLK_INPUT_HZ  : positive := 12000000;
        CLK_OUTPUT_HZ : positive := 50000000
-	);
+        );
    port (
        ext_clk        : in  std_logic;
        pll_rst_in     : in  std_logic;
@ -24,66 +24,66 @@ architecture rtl of clock_generator is
        clkfbout_mult : real range 2.0 to 64.0;
        clkout_divide : real range 1.0 to 128.0;
        divclk_divide : integer range 1 to 106;
-	force_rst     : std_ulogic;
+        force_rst     : std_ulogic;
    end record;

    function gen_pll_settings (
        constant input_hz : positive;
-	constant output_hz : positive)
+        constant output_hz : positive)
        return pll_settings_t is

-	constant bad_settings : pll_settings_t :=
-	    (clkin_period  => 0.0,
-	     clkfbout_mult => 2.0,
-	     clkout_divide => 1.0,
-	     divclk_divide => 1,
-	     force_rst     => '1');
+        constant bad_settings : pll_settings_t :=
+            (clkin_period  => 0.0,
+             clkfbout_mult => 2.0,
+             clkout_divide => 1.0,
+             divclk_divide => 1,
+             force_rst     => '1');
    begin
        case input_hz is
-	when 100000000 =>
-	    case output_hz is
-	    when 100000000 =>
-		return (clkin_period  => 10.0,
-			clkfbout_mult => 16.0,
-			clkout_divide => 16.0,
-			divclk_divide => 1,
-			force_rst     => '0');
-	    when  50000000 =>
-		return (clkin_period  => 10.0,
-			clkfbout_mult => 16.0,
-			clkout_divide => 32.0,
-			divclk_divide => 1,
-			force_rst     => '0');
-	    when others =>
-		report "Unsupported output frequency" severity failure;
-		return bad_settings;
-	    end case;
-	when 12000000 =>
-	    case output_hz is
-	    when 100000000 =>
-		return (clkin_period  => 83.33,
-			clkfbout_mult => 50.0,
-			clkout_divide => 6.0,
-			divclk_divide => 1,
-			force_rst     => '0');			
-	    when  50000000 =>
-		return (clkin_period  => 83.33,
-			clkfbout_mult => 50.0,
-			clkout_divide => 12.0,
-			divclk_divide => 1,
-			force_rst     => '0');
-	    when others =>
-		report "Unsupported output frequency" severity failure;
-		return bad_settings;
-	    end case;
-	when others =>
-	    report "Unsupported input frequency" severity failure;
-	    return bad_settings;
+        when 100000000 =>
+            case output_hz is
+            when 100000000 =>
+                return (clkin_period  => 10.0,
+                        clkfbout_mult => 16.0,
+                        clkout_divide => 16.0,
+                        divclk_divide => 1,
+                        force_rst     => '0');
+            when  50000000 =>
+                return (clkin_period  => 10.0,
+                        clkfbout_mult => 16.0,
+                        clkout_divide => 32.0,
+                        divclk_divide => 1,
+                        force_rst     => '0');
+            when others =>
+                report "Unsupported output frequency" severity failure;
+                return bad_settings;
+            end case;
+        when 12000000 =>
+            case output_hz is
+            when 100000000 =>
+                return (clkin_period  => 83.33,
+                        clkfbout_mult => 50.0,
+                        clkout_divide => 6.0,
+                        divclk_divide => 1,
+                        force_rst     => '0');
+            when  50000000 =>
+                return (clkin_period  => 83.33,
+                        clkfbout_mult => 50.0,
+                        clkout_divide => 12.0,
+                        divclk_divide => 1,
+                        force_rst     => '0');
+            when others =>
+                report "Unsupported output frequency" severity failure;
+                return bad_settings;
+            end case;
+        when others =>
+            report "Unsupported input frequency" severity failure;
+            return bad_settings;
        end case;
    end function gen_pll_settings;

    constant pll_settings : pll_settings_t := gen_pll_settings(clk_input_hz,
-							       clk_output_hz);
+                                                               clk_output_hz);
 begin
    pll : MMCME2_BASE
        generic map (
@ -111,6 +111,6 @@ begin
            CLKFBIN    => clkfb,
            CLKIN1     => ext_clk,
            PWRDWN     => '0',
-	    RST      => pll_rst_in or pll_settings.force_rst
+            RST      => pll_rst_in or pll_settings.force_rst
            );
 end architecture rtl;
--- a/fpga/clk_gen_plle2.vhd
+++ b/fpga/clk_gen_plle2.vhd
@ -6,100 +6,112 @@ use UNISIM.vcomponents.all;

 entity clock_generator is
    generic (
-	CLK_INPUT_HZ  : positive := 100000000;
-	CLK_OUTPUT_HZ : positive := 100000000
-	);
+        CLK_INPUT_HZ  : positive := 100000000;
+        CLK_OUTPUT_HZ : positive := 100000000
+        );
    port (
-	ext_clk        : in  std_logic;
-	pll_rst_in   : in  std_logic;
-	pll_clk_out    : out std_logic;
-	pll_locked_out : out std_logic);
+        ext_clk        : in  std_logic;
+        pll_rst_in   : in  std_logic;
+        pll_clk_out    : out std_logic;
+        pll_locked_out : out std_logic);
 end entity clock_generator;

 architecture rtl of clock_generator is
    signal clkfb : std_ulogic;

    type pll_settings_t is record
-	clkin_period  : real    range 0.000 to 52.631;
-	clkfbout_mult : integer range 2 to 64;
-	clkout_divide : integer range 1 to 128;
-	divclk_divide : integer range 1 to 56;
-	force_rst     : std_ulogic;
+        clkin_period  : real    range 0.000 to 52.631;
+        clkfbout_mult : integer range 2 to 64;
+        clkout_divide : integer range 1 to 128;
+        divclk_divide : integer range 1 to 56;
+        force_rst     : std_ulogic;
    end record;

    function gen_pll_settings (
        constant input_hz : positive;
-	constant output_hz : positive)
-	return pll_settings_t is
+        constant output_hz : positive)
+        return pll_settings_t is

-	constant bad_settings : pll_settings_t :=
-	    (clkin_period  => 0.0,
-	     clkfbout_mult => 2,
-	     clkout_divide => 1,
-	     divclk_divide => 1,
-	     force_rst     => '1');
+        constant bad_settings : pll_settings_t :=
+            (clkin_period  => 0.0,
+             clkfbout_mult => 2,
+             clkout_divide => 1,
+             divclk_divide => 1,
+             force_rst     => '1');
    begin
-	case input_hz is
-	when 200000000 =>
-	    case output_hz is
-	    when 100000000 =>
-		return (clkin_period  => 5.0,
-			clkfbout_mult => 8,
-			clkout_divide => 16,
-			divclk_divide => 1,
-			force_rst     => '0');
-	    when others =>
-		report "Unsupported output frequency" severity failure;
-		return bad_settings;
-	    end case;
-	when 100000000 =>
-	    case output_hz is
-	    when 100000000 =>
-		return (clkin_period  => 10.0,
-			clkfbout_mult => 16,
-			clkout_divide => 16,
-			divclk_divide => 1,
-			force_rst     => '0');
-	    when  50000000 =>
-		return (clkin_period  => 10.0,
-			clkfbout_mult => 16,
-			clkout_divide => 32,
-			divclk_divide => 1,
-			force_rst     => '0');
-	    when others =>
-		report "Unsupported output frequency" severity failure;
-		return bad_settings;
-	    end case;
-	when others =>
-	    report "Unsupported input frequency" severity failure;
-	    return bad_settings;
-	end case;
+        case input_hz is
+        when 200000000 =>
+            case output_hz is
+            when 100000000 =>
+                return (clkin_period  => 5.0,
+                        clkfbout_mult => 8,
+                        clkout_divide => 16,
+                        divclk_divide => 1,
+                        force_rst     => '0');
+            when others =>
+                report "Unsupported output frequency" severity failure;
+                return bad_settings;
+            end case;
+        when 100000000 =>
+            case output_hz is
+            when 100000000 =>
+                return (clkin_period  => 10.0,
+                        clkfbout_mult => 16,
+                        clkout_divide => 16,
+                        divclk_divide => 1,
+                        force_rst     => '0');
+            when  50000000 =>
+                return (clkin_period  => 10.0,
+                        clkfbout_mult => 16,
+                        clkout_divide => 32,
+                        divclk_divide => 1,
+                        force_rst     => '0');
+            when others =>
+                report "Unsupported output frequency" severity failure;
+                return bad_settings;
+            end case;
+        when 50000000 =>
+            case output_hz is
+            when 100000000 =>
+                return (clkin_period  => 20.0,
+                        clkfbout_mult => 32,
+                        clkout_divide => 16,
+                        divclk_divide => 1,
+                        force_rst     => '0');
+            when others =>
+                report "Unsupported output frequency" severity failure;
+                return bad_settings;
+            end case;
+        when others =>
+            report "Unsupported input frequency" severity failure;
+            return bad_settings;
+        end case;
    end function gen_pll_settings;

    constant pll_settings : pll_settings_t := gen_pll_settings(clk_input_hz,
-							       clk_output_hz);
+                                                               clk_output_hz);
 begin

    pll : PLLE2_BASE
-	generic map (
-	    BANDWIDTH          => "OPTIMIZED",
-	    CLKFBOUT_MULT      => pll_settings.clkfbout_mult,
-	    CLKIN1_PERIOD      => pll_settings.clkin_period,
-	    CLKOUT0_DIVIDE     => pll_settings.clkout_divide,
-	    DIVCLK_DIVIDE      => pll_settings.divclk_divide,
-	    STARTUP_WAIT       => "FALSE")
-	port map (
-	    CLKOUT0  => pll_clk_out,
-	    CLKOUT1  => open,
-	    CLKOUT2  => open,
-	    CLKOUT3  => open,
-	    CLKOUT4  => open,
-	    CLKOUT5  => open,
-	    CLKFBOUT => clkfb,
-	    LOCKED   => pll_locked_out,
-	    CLKIN1   => ext_clk,
-	    PWRDWN   => '0',
-	    RST      => pll_rst_in or pll_settings.force_rst,
-	    CLKFBIN  => clkfb);
+        generic map (
+            BANDWIDTH          => "OPTIMIZED",
+            CLKFBOUT_MULT      => pll_settings.clkfbout_mult,
+            CLKIN1_PERIOD      => pll_settings.clkin_period,
+            CLKOUT0_DIVIDE     => pll_settings.clkout_divide,
+            DIVCLK_DIVIDE      => pll_settings.divclk_divide,
+            STARTUP_WAIT       => "FALSE")
+        port map (
+            CLKOUT0  => pll_clk_out,
+            CLKOUT1  => open,
+            CLKOUT2  => open,
+            CLKOUT3  => open,
+            CLKOUT4  => open,
+            CLKOUT5  => open,
+            CLKFBOUT => clkfb,
+            LOCKED   => pll_locked_out,
+            CLKIN1   => ext_clk,
+            PWRDWN   => '0',
+            RST      => pll_rst_in or pll_settings.force_rst,
+            CLKFBIN  => clkfb);

 end architecture rtl;
--- a/fpga/cmod_a7-35.xdc
+++ b/fpga/cmod_a7-35.xdc
@ -1,6 +1,6 @@
 ## Clock signal 12 MHz
 set_property -dict { PACKAGE_PIN L17   IOSTANDARD LVCMOS33 } [get_ports { ext_clk }];
-create_clock -add -name sys_clk_pin -period 83.33 -waveform {0 41.66} [get_ports {ext_clk}];
+create_clock -name sys_clk_pin -period 83.33 [get_ports {ext_clk}];

 set_property -dict { PACKAGE_PIN J18   IOSTANDARD LVCMOS33 } [get_ports { uart0_txd }];
 set_property -dict { PACKAGE_PIN J17   IOSTANDARD LVCMOS33 } [get_ports { uart0_rxd  }];
--- a/fpga/genesys2.xdc
+++ b/fpga/genesys2.xdc
@ -3,8 +3,8 @@
 ## Clock & Reset
 set_property -dict { PACKAGE_PIN AD11  IOSTANDARD LVDS     } [get_ports { clk200_n }]
 set_property -dict { PACKAGE_PIN AD12  IOSTANDARD LVDS     } [get_ports { clk200_p }]
-create_clock -period 5.000 -name tc_clk100_p -waveform {0.000 2.500} [get_ports clk200_p]
-create_clock -period 5.000 -name tc_clk100_n -waveform {2.500 5.000} [get_ports clk200_n]
+create_clock -period 5.000 -name tc_clk100_p [get_ports clk200_p]
+create_clock -period 5.000 -name tc_clk100_n [get_ports clk200_n]

 set_property -dict { PACKAGE_PIN R19   IOSTANDARD LVCMOS33 } [get_ports { ext_rst }]

--- a/fpga/main_bram.vhdl
+++ b/fpga/main_bram.vhdl
@ -9,20 +9,20 @@ library work;

 entity main_bram is
    generic(
-	WIDTH        : natural := 64;
-	HEIGHT_BITS  : natural := 1024;
-	MEMORY_SIZE  : natural := 65536;
-	RAM_INIT_FILE : string
-	);
+        WIDTH        : natural := 64;
+        HEIGHT_BITS  : natural := 1024;
+        MEMORY_SIZE  : natural := 65536;
+        RAM_INIT_FILE : string
+        );
    port(
-	clk  : in std_logic;
-	addr : in std_logic_vector(HEIGHT_BITS - 1 downto 0) ;
-	di   : in std_logic_vector(WIDTH-1 downto 0);
-	do   : out std_logic_vector(WIDTH-1 downto 0);
-	sel  : in std_logic_vector((WIDTH/8)-1 downto 0);
-	re   : in std_ulogic;
-	we   : in std_ulogic
-	);
+        clk  : in std_logic;
+        addr : in std_logic_vector(HEIGHT_BITS - 1 downto 0) ;
+        din  : in std_logic_vector(WIDTH-1 downto 0);
+        dout : out std_logic_vector(WIDTH-1 downto 0);
+        sel  : in std_logic_vector((WIDTH/8)-1 downto 0);
+        re   : in std_ulogic;
+        we   : in std_ulogic
+        );
 end entity main_bram;

 architecture behaviour of main_bram is
@ -63,20 +63,20 @@ begin
    -- Actual RAM template    
    memory_0: process(clk)
    begin
-	if rising_edge(clk) then
-	    if we = '1' then
-		for i in 0 to 7 loop
-		    if sel(i) = '1' then
-			memory(to_integer(unsigned(addr)))((i + 1) * 8 - 1 downto i * 8) <=
-			    di((i + 1) * 8 - 1 downto i * 8);
-		    end if;
-		end loop;
-	    end if;
-	    if re = '1' then
-		obuf <= memory(to_integer(unsigned(addr)));
-	    end if;
-	    do <= obuf;
-	end if;
+        if rising_edge(clk) then
+            if we = '1' then
+                for i in 0 to 7 loop
+                    if sel(i) = '1' then
+                        memory(to_integer(unsigned(addr)))((i + 1) * 8 - 1 downto i * 8) <=
+                            din((i + 1) * 8 - 1 downto i * 8);
+                    end if;
+                end loop;
+            end if;
+            if re = '1' then
+                obuf <= memory(to_integer(unsigned(addr)));
+            end if;
+            dout <= obuf;
+        end if;
    end process;

 end architecture behaviour;
--- a/fpga/nexys-video.xdc
+++ b/fpga/nexys-video.xdc
@ -4,7 +4,7 @@

 set_property -dict {PACKAGE_PIN R4 IOSTANDARD LVCMOS33} [get_ports ext_clk]

-set_property -dict {PACKAGE_PIN G4 IOSTANDARD LVCMOS15} [get_ports ext_rst]
+set_property -dict {PACKAGE_PIN G4 IOSTANDARD LVCMOS15} [get_ports ext_rst_n]

 set_property -dict {PACKAGE_PIN AA19 IOSTANDARD LVCMOS33} [get_ports uart_main_tx]
 set_property -dict {PACKAGE_PIN V18 IOSTANDARD LVCMOS33} [get_ports uart_main_rx]
@ -22,8 +22,14 @@ set_property -dict {PACKAGE_PIN V18 IOSTANDARD LVCMOS33} [get_ports uart_main_rx
 # LEDs
 ################################################################################

-set_property -dict { PACKAGE_PIN T14  IOSTANDARD LVCMOS33 } [get_ports { led0 }];
-set_property -dict { PACKAGE_PIN T15  IOSTANDARD LVCMOS33 } [get_ports { led1 }];
+set_property -dict { PACKAGE_PIN T14  IOSTANDARD LVCMOS25 } [get_ports { led0 }];
+set_property -dict { PACKAGE_PIN T15  IOSTANDARD LVCMOS25 } [get_ports { led1 }];
+set_property -dict { PACKAGE_PIN T16  IOSTANDARD LVCMOS25 } [get_ports { led2 }];
+set_property -dict { PACKAGE_PIN U16  IOSTANDARD LVCMOS25 } [get_ports { led3 }];
+set_property -dict { PACKAGE_PIN V15  IOSTANDARD LVCMOS25 } [get_ports { led4 }];
+set_property -dict { PACKAGE_PIN W16  IOSTANDARD LVCMOS25 } [get_ports { led5 }];
+set_property -dict { PACKAGE_PIN W15  IOSTANDARD LVCMOS25 } [get_ports { led6 }];
+set_property -dict { PACKAGE_PIN Y13  IOSTANDARD LVCMOS25 } [get_ports { led7 }];

 ################################################################################
 # SPI Flash
@ -35,6 +41,91 @@ set_property -dict { PACKAGE_PIN R22 IOSTANDARD LVCMOS33 } [get_ports { spi_flas
 set_property -dict { PACKAGE_PIN P21 IOSTANDARD LVCMOS33 } [get_ports { spi_flash_wp_n }];
 set_property -dict { PACKAGE_PIN R21 IOSTANDARD LVCMOS33 } [get_ports { spi_flash_hold_n }];

+################################################################################
+# SD card
+################################################################################
+
+set_property -dict { PACKAGE_PIN W19 IOSTANDARD LVCMOS33 SLEW FAST } [get_ports { sdcard_clk }]
+set_property -dict { PACKAGE_PIN T18 IOSTANDARD LVCMOS33 } [get_ports { sdcard_cd }]
+set_property -dict { PACKAGE_PIN W20 IOSTANDARD LVCMOS33 SLEW FAST } [get_ports { sdcard_cmd }]
+set_property -dict { PACKAGE_PIN V19 IOSTANDARD LVCMOS33 SLEW FAST } [get_ports { sdcard_data[0] }]
+set_property -dict { PACKAGE_PIN T21 IOSTANDARD LVCMOS33 SLEW FAST } [get_ports { sdcard_data[1] }]
+set_property -dict { PACKAGE_PIN T20 IOSTANDARD LVCMOS33 SLEW FAST } [get_ports { sdcard_data[2] }]
+set_property -dict { PACKAGE_PIN U18 IOSTANDARD LVCMOS33 SLEW FAST } [get_ports { sdcard_data[3] }]
+set_property -dict { PACKAGE_PIN V20 IOSTANDARD LVCMOS33 } [get_ports { sdcard_reset }]
+
+# Put registers into IOBs to improve timing
+set_property IOB true [get_cells -hierarchical -filter {NAME =~*.litesdcard/sdcard_*}]
+
+################################################################################
+# Ethernet (generated by LiteX)
+################################################################################
+
+# eth_clocks:0.tx
+set_property LOC AA14 [get_ports {eth_clocks_tx}]
+set_property IOSTANDARD LVCMOS25 [get_ports {eth_clocks_tx}]
+
+# eth_clocks:0.rx
+set_property LOC V13 [get_ports {eth_clocks_rx}]
+set_property IOSTANDARD LVCMOS25 [get_ports {eth_clocks_rx}]
+
+# eth:0.rst_n (NOTE(review): IOSTANDARD is set twice below; the later LVCMOS33 should take effect - confirm)
+set_property LOC U7 [get_ports {eth_rst_n}]
+set_property IOSTANDARD LVCMOS25 [get_ports {eth_rst_n}]
+set_property IOSTANDARD LVCMOS33 [get_ports {eth_rst_n}]
+
+# eth:0.int_n
+set_property LOC Y14 [get_ports {eth_int_n}]
+set_property IOSTANDARD LVCMOS25 [get_ports {eth_int_n}]
+
+# eth:0.mdio
+set_property LOC Y16 [get_ports {eth_mdio}]
+set_property IOSTANDARD LVCMOS25 [get_ports {eth_mdio}]
+
+# eth:0.mdc
+set_property LOC AA16 [get_ports {eth_mdc}]
+set_property IOSTANDARD LVCMOS25 [get_ports {eth_mdc}]
+
+# eth:0.rx_ctl
+set_property LOC W10 [get_ports {eth_rx_ctl}]
+set_property IOSTANDARD LVCMOS25 [get_ports {eth_rx_ctl}]
+
+# eth:0.rx_data
+set_property LOC AB16 [get_ports {eth_rx_data[0]}]
+set_property IOSTANDARD LVCMOS25 [get_ports {eth_rx_data[0]}]
+
+# eth:0.rx_data
+set_property LOC AA15 [get_ports {eth_rx_data[1]}]
+set_property IOSTANDARD LVCMOS25 [get_ports {eth_rx_data[1]}]
+
+# eth:0.rx_data
+set_property LOC AB15 [get_ports {eth_rx_data[2]}]
+set_property IOSTANDARD LVCMOS25 [get_ports {eth_rx_data[2]}]
+
+# eth:0.rx_data
+set_property LOC AB11 [get_ports {eth_rx_data[3]}]
+set_property IOSTANDARD LVCMOS25 [get_ports {eth_rx_data[3]}]
+
+# eth:0.tx_ctl
+set_property LOC V10 [get_ports {eth_tx_ctl}]
+set_property IOSTANDARD LVCMOS25 [get_ports {eth_tx_ctl}]
+
+# eth:0.tx_data
+set_property LOC Y12 [get_ports {eth_tx_data[0]}]
+set_property IOSTANDARD LVCMOS25 [get_ports {eth_tx_data[0]}]
+
+# eth:0.tx_data
+set_property LOC W12 [get_ports {eth_tx_data[1]}]
+set_property IOSTANDARD LVCMOS25 [get_ports {eth_tx_data[1]}]
+
+# eth:0.tx_data
+set_property LOC W11 [get_ports {eth_tx_data[2]}]
+set_property IOSTANDARD LVCMOS25 [get_ports {eth_tx_data[2]}]
+
+# eth:0.tx_data
+set_property LOC Y11 [get_ports {eth_tx_data[3]}]
+set_property IOSTANDARD LVCMOS25 [get_ports {eth_tx_data[3]}]
+
 ################################################################################
 # DRAM (generated by LiteX)
 ################################################################################
@ -313,12 +404,18 @@ set_property CONFIG_MODE SPIx4 [current_design]
 # Clock constraints
 ################################################################################

-create_clock -add -name sys_clk_pin -period 10.00 -waveform {0 5} [get_ports { ext_clk }];
+create_clock -name sys_clk_pin -period 10.00 [get_ports { ext_clk }];
+
+create_clock -name eth_clocks_rx -period 8.0 [get_ports { eth_clocks_rx }]
+
+set_clock_groups -asynchronous -group [get_clocks sys_clk_pin -include_generated_clocks] -group [get_clocks eth_clocks_rx -include_generated_clocks]

 ################################################################################
-# False path constraints (from LiteX as they relate to LiteDRAM)
+# False path constraints (from LiteX as they relate to LiteDRAM and LiteEth)
 ################################################################################

+set_false_path -quiet -through [get_nets -hierarchical -filter {mr_ff == TRUE}]
+
 set_false_path -quiet -to [get_pins -filter {REF_PIN_NAME == PRE} -of_objects [get_cells -hierarchical -filter {ars_ff1 == TRUE || ars_ff2 == TRUE}]]

 set_max_delay 2 -quiet -from [get_pins -filter {REF_PIN_NAME == C} -of_objects [get_cells -hierarchical -filter {ars_ff1 == TRUE}]] -to [get_pins -filter {REF_PIN_NAME == D} -of_objects [get_cells -hierarchical -filter {ars_ff2 == TRUE}]]
--- a/fpga/nexys_a7.xdc
+++ b/fpga/nexys_a7.xdc
@ -1,5 +1,5 @@
 set_property -dict {PACKAGE_PIN E3 IOSTANDARD LVCMOS33} [get_ports ext_clk]
-create_clock -period 10.000 -name sys_clk_pin -waveform {0.000 5.000} -add [get_ports ext_clk]
+create_clock -period 10.000 -name sys_clk_pin [get_ports ext_clk]

 set_property -dict {PACKAGE_PIN C12 IOSTANDARD LVCMOS33} [get_ports ext_rst]

--- a/fpga/top-acorn-cle-215.vhdl
+++ b/fpga/top-acorn-cle-215.vhdl
@ -94,6 +94,10 @@ architecture behaviour of toplevel is
    signal spi_sdat_oe : std_ulogic_vector(3 downto 0);
    signal spi_sdat_i  : std_ulogic_vector(3 downto 0);

+    -- ddram clock pins wrapped as 1-element vectors for the ddram_clk_p/_n port map
+    signal ddram_clk_p_vec : std_logic_vector(0 downto 0);
+    signal ddram_clk_n_vec : std_logic_vector(0 downto 0);
+
    -- Fixup various memory sizes based on generics
    function get_bram_size return natural is
    begin
@ -252,6 +256,9 @@ begin
 	-- but for now, assert it's 100Mhz
 	assert CLK_FREQUENCY = 100000000;

+	ddram_clk_p_vec <= (others => ddram_clk_p);
+	ddram_clk_n_vec <= (others => ddram_clk_n);
+
 	reset_controller: entity work.soc_reset
 	    generic map(
 		RESET_LOW => false,
@ -272,6 +279,7 @@ begin
 		DRAM_ABITS => 26,
 		DRAM_ALINES => 16,
                DRAM_DLINES => 16,
+                DRAM_CKLINES => 1,
                DRAM_PORT_WIDTH => 128,
                PAYLOAD_FILE => RAM_INIT_FILE,
                PAYLOAD_SIZE => PAYLOAD_SIZE
@ -304,8 +312,8 @@ begin
 		ddram_dq	=> ddram_dq,
 		ddram_dqs_p	=> ddram_dqs_p,
 		ddram_dqs_n	=> ddram_dqs_n,
-		ddram_clk_p	=> ddram_clk_p,
-		ddram_clk_n	=> ddram_clk_n,
+		ddram_clk_p	=> ddram_clk_p_vec,
+		ddram_clk_n	=> ddram_clk_n_vec,
 		ddram_cke	=> ddram_cke,
 		ddram_odt	=> ddram_odt,
 		ddram_reset_n	=> ddram_reset_n
--- a/fpga/top-arty.vhdl
+++ b/fpga/top-arty.vhdl
@ -15,6 +15,8 @@ entity toplevel is
        RESET_LOW          : boolean  := true;
        CLK_FREQUENCY      : positive := 100000000;
        HAS_FPU            : boolean  := true;
+        HAS_BTC            : boolean  := true;
+        HAS_SHORT_MULT     : boolean  := false;
        USE_LITEDRAM       : boolean  := false;
        NO_BRAM            : boolean  := false;
        DISABLE_FLATTEN_CORE : boolean := false;
@ -25,7 +27,10 @@ entity toplevel is
        LOG_LENGTH         : natural := 512;
        USE_LITEETH        : boolean  := false;
        UART_IS_16550      : boolean  := false;
-        HAS_UART1          : boolean  := true
+        HAS_UART1          : boolean  := true;
+        USE_LITESDCARD     : boolean := false;
+        HAS_GPIO           : boolean := true;
+        NGPIO              : natural := 32
        );
    port(
        ext_clk   : in  std_ulogic;
@ -35,12 +40,6 @@ entity toplevel is
        uart_main_tx : out std_ulogic;
        uart_main_rx : in  std_ulogic;

-	-- UART1 signals:
-	uart_pmod_tx    : out std_ulogic;
-	uart_pmod_rx    : in std_ulogic;
-	uart_pmod_cts_n : in std_ulogic;
-	uart_pmod_rts_n : out std_ulogic;
-
        -- LEDs
        led0_b  : out std_ulogic;
        led0_g  : out std_ulogic;
@ -58,6 +57,9 @@ entity toplevel is
        spi_flash_wp_n   : inout std_ulogic;
        spi_flash_hold_n : inout std_ulogic;

+        -- GPIO
+        shield_io        : inout std_ulogic_vector(44 downto 0);
+
        -- Ethernet
        eth_ref_clk      : out std_ulogic;
        eth_clocks_tx    : in std_ulogic;
@ -73,6 +75,12 @@ entity toplevel is
        eth_col          : in std_ulogic;
        eth_crs          : in std_ulogic;

+        -- SD card
+        sdcard_data   : inout std_ulogic_vector(3 downto 0);
+        sdcard_cmd    : inout std_ulogic;
+        sdcard_clk    : out   std_ulogic;
+        sdcard_cd     : in    std_ulogic;
+
        -- DRAM wires
        ddram_a       : out std_ulogic_vector(13 downto 0);
        ddram_ba      : out std_ulogic_vector(2 downto 0);
@ -109,6 +117,7 @@ architecture behaviour of toplevel is
    signal wb_ext_is_dram_csr  : std_ulogic;
    signal wb_ext_is_dram_init : std_ulogic;
    signal wb_ext_is_eth       : std_ulogic;
+    signal wb_ext_is_sdcard    : std_ulogic;

    -- DRAM main data wishbone connection
    signal wb_dram_in          : wishbone_master_out;
@ -121,6 +130,16 @@ architecture behaviour of toplevel is
    signal ext_irq_eth         : std_ulogic;
    signal wb_eth_out          : wb_io_slave_out := wb_io_slave_out_init;

+    -- LiteSDCard connection
+    signal ext_irq_sdcard      : std_ulogic := '0';
+    signal wb_sdcard_out       : wb_io_slave_out := wb_io_slave_out_init;
+    signal wb_sddma_out        : wb_io_master_out := wb_io_master_out_init;
+    signal wb_sddma_in         : wb_io_slave_out;
+    signal wb_sddma_nr         : wb_io_master_out;
+    signal wb_sddma_ir         : wb_io_slave_out;
+    -- for conversion from non-pipelined wishbone to pipelined
+    signal wb_sddma_stb_sent   : std_ulogic;
+
    -- Control/status
    signal core_alt_reset : std_ulogic;

@ -139,6 +158,15 @@ architecture behaviour of toplevel is
    signal spi_sdat_oe : std_ulogic_vector(3 downto 0);
    signal spi_sdat_i  : std_ulogic_vector(3 downto 0);

+    -- GPIO
+    signal gpio_in     : std_ulogic_vector(NGPIO - 1 downto 0);
+    signal gpio_out    : std_ulogic_vector(NGPIO - 1 downto 0);
+    signal gpio_dir    : std_ulogic_vector(NGPIO - 1 downto 0);
+
+    -- ddram clock signals as vectors
+    signal ddram_clk_p_vec : std_logic_vector(0 downto 0);
+    signal ddram_clk_n_vec : std_logic_vector(0 downto 0);
+
    -- Fixup various memory sizes based on generics
    function get_bram_size return natural is
    begin
@ -170,6 +198,8 @@ begin
            SIM                => false,
            CLK_FREQ           => CLK_FREQUENCY,
            HAS_FPU            => HAS_FPU,
+            HAS_BTC            => HAS_BTC,
+            HAS_SHORT_MULT     => HAS_SHORT_MULT,
            HAS_DRAM           => USE_LITEDRAM,
            DRAM_SIZE          => 256 * 1024 * 1024,
            DRAM_INIT_SIZE     => PAYLOAD_SIZE,
@ -182,7 +212,10 @@ begin
            LOG_LENGTH         => LOG_LENGTH,
            HAS_LITEETH        => USE_LITEETH,
            UART0_IS_16550     => UART_IS_16550,
-            HAS_UART1          => HAS_UART1
+            HAS_UART1          => HAS_UART1,
+            HAS_SD_CARD        => USE_LITESDCARD,
+            HAS_GPIO           => HAS_GPIO,
+            NGPIO              => NGPIO
            )
        port map (
            -- System signals
@ -194,8 +227,8 @@ begin
            uart0_rxd         => uart_main_rx,

 	    -- UART1 signals
-	    uart1_txd         => uart_pmod_tx,
-	    uart1_rxd         => uart_pmod_rx,
+	    --uart1_txd         => uart_pmod_tx,
+	    --uart1_rxd         => uart_pmod_rx,

            -- SPI signals
            spi_flash_sck     => spi_sck,
@ -204,21 +237,35 @@ begin
            spi_flash_sdat_oe => spi_sdat_oe,
            spi_flash_sdat_i  => spi_sdat_i,

+            -- GPIO signals
+            gpio_in           => gpio_in,
+            gpio_out          => gpio_out,
+            gpio_dir          => gpio_dir,
+
            -- External interrupts
            ext_irq_eth       => ext_irq_eth,
+            ext_irq_sdcard    => ext_irq_sdcard,

            -- DRAM wishbone
            wb_dram_in           => wb_dram_in,
            wb_dram_out          => wb_dram_out,
+
+            -- IO wishbone
            wb_ext_io_in         => wb_ext_io_in,
            wb_ext_io_out        => wb_ext_io_out,
            wb_ext_is_dram_csr   => wb_ext_is_dram_csr,
            wb_ext_is_dram_init  => wb_ext_is_dram_init,
            wb_ext_is_eth        => wb_ext_is_eth,
+            wb_ext_is_sdcard     => wb_ext_is_sdcard,
+
+            -- DMA wishbone
+            wishbone_dma_in      => wb_sddma_in,
+            wishbone_dma_out     => wb_sddma_out,
+
            alt_reset            => core_alt_reset
            );

-    uart_pmod_rts_n <= '0';
+    --uart_pmod_rts_n <= '0';

    -- SPI Flash
    --
@ -339,11 +386,15 @@ begin
            end if;
        end process;

+	ddram_clk_p_vec <= (others => ddram_clk_p);  -- NOTE(review): vector is assigned from the top-level port here and is also bound to the wrapper's ddram_clk_p below; confirm intended direction
+	ddram_clk_n_vec <= (others => ddram_clk_n);  -- same pattern for the negative clock line
+
        dram: entity work.litedram_wrapper
            generic map(
                DRAM_ABITS => 24,
                DRAM_ALINES => 14,
                DRAM_DLINES => 16,
+                DRAM_CKLINES => 1,
                DRAM_PORT_WIDTH => 128,
                PAYLOAD_FILE => RAM_INIT_FILE,
                PAYLOAD_SIZE => PAYLOAD_SIZE
@ -376,8 +427,8 @@ begin
                ddram_dq        => ddram_dq,
                ddram_dqs_p     => ddram_dqs_p,
                ddram_dqs_n     => ddram_dqs_n,
-                ddram_clk_p     => ddram_clk_p,
-                ddram_clk_n     => ddram_clk_n,
+                ddram_clk_p     => ddram_clk_p_vec,
+                ddram_clk_n     => ddram_clk_n_vec,
                ddram_cke       => ddram_cke,
                ddram_odt       => ddram_odt,
                ddram_reset_n   => ddram_reset_n
@ -509,7 +560,7 @@ begin
        wb_eth_cyc <= wb_ext_io_in.cyc and wb_ext_is_eth;

        -- Remove top address bits as liteeth decoder doesn't know about them
-        wb_eth_adr <= x"000" & "000" & wb_ext_io_in.adr(16 downto 2);
+        wb_eth_adr <= x"000" & "000" & wb_ext_io_in.adr(14 downto 0);

        -- LiteETH isn't pipelined
        wb_eth_out.stall <= not wb_eth_out.ack;
@ -521,8 +572,113 @@ begin
        ext_irq_eth    <= '0';
    end generate;

+    -- SD card pmod
+    has_sdcard : if USE_LITESDCARD generate
+        component litesdcard_core port (
+            clk           : in    std_ulogic;
+            rst           : in    std_ulogic;
+            -- wishbone for accessing control registers
+            wb_ctrl_adr   : in    std_ulogic_vector(29 downto 0);
+            wb_ctrl_dat_w : in    std_ulogic_vector(31 downto 0);
+            wb_ctrl_dat_r : out   std_ulogic_vector(31 downto 0);
+            wb_ctrl_sel   : in    std_ulogic_vector(3 downto 0);
+            wb_ctrl_cyc   : in    std_ulogic;
+            wb_ctrl_stb   : in    std_ulogic;
+            wb_ctrl_ack   : out   std_ulogic;
+            wb_ctrl_we    : in    std_ulogic;
+            wb_ctrl_cti   : in    std_ulogic_vector(2 downto 0);
+            wb_ctrl_bte   : in    std_ulogic_vector(1 downto 0);
+            wb_ctrl_err   : out   std_ulogic;
+            -- wishbone for SD card core to use for DMA
+            wb_dma_adr    : out   std_ulogic_vector(29 downto 0);
+            wb_dma_dat_w  : out   std_ulogic_vector(31 downto 0);
+            wb_dma_dat_r  : in    std_ulogic_vector(31 downto 0);
+            wb_dma_sel    : out   std_ulogic_vector(3 downto 0);
+            wb_dma_cyc    : out   std_ulogic;
+            wb_dma_stb    : out   std_ulogic;
+            wb_dma_ack    : in    std_ulogic;
+            wb_dma_we     : out   std_ulogic;
+            wb_dma_cti    : out   std_ulogic_vector(2 downto 0);
+            wb_dma_bte    : out   std_ulogic_vector(1 downto 0);
+            wb_dma_err    : in    std_ulogic;
+            -- connections to SD card
+            sdcard_data   : inout std_ulogic_vector(3 downto 0);
+            sdcard_cmd    : inout std_ulogic;
+            sdcard_clk    : out   std_ulogic;
+            sdcard_cd     : in    std_ulogic;
+            irq           : out   std_ulogic
+            );
+        end component;
+
+        signal wb_sdcard_cyc : std_ulogic;
+        signal wb_sdcard_adr : std_ulogic_vector(29 downto 0);
+
+    begin
+        litesdcard : litesdcard_core
+            port map (
+                clk           => system_clk,
+                rst           => soc_rst,
+                wb_ctrl_adr   => wb_sdcard_adr,
+                wb_ctrl_dat_w => wb_ext_io_in.dat,
+                wb_ctrl_dat_r => wb_sdcard_out.dat,
+                wb_ctrl_sel   => wb_ext_io_in.sel,
+                wb_ctrl_cyc   => wb_sdcard_cyc,
+                wb_ctrl_stb   => wb_ext_io_in.stb,
+                wb_ctrl_ack   => wb_sdcard_out.ack,
+                wb_ctrl_we    => wb_ext_io_in.we,
+                wb_ctrl_cti   => "000",
+                wb_ctrl_bte   => "00",
+                wb_ctrl_err   => open,
+                wb_dma_adr    => wb_sddma_nr.adr,
+                wb_dma_dat_w  => wb_sddma_nr.dat,
+                wb_dma_dat_r  => wb_sddma_ir.dat,
+                wb_dma_sel    => wb_sddma_nr.sel,
+                wb_dma_cyc    => wb_sddma_nr.cyc,
+                wb_dma_stb    => wb_sddma_nr.stb,
+                wb_dma_ack    => wb_sddma_ir.ack,
+                wb_dma_we     => wb_sddma_nr.we,
+                wb_dma_cti    => open,
+                wb_dma_bte    => open,
+                wb_dma_err    => '0',
+                sdcard_data   => sdcard_data,
+                sdcard_cmd    => sdcard_cmd,
+                sdcard_clk    => sdcard_clk,
+                sdcard_cd     => sdcard_cd,
+                irq           => ext_irq_sdcard
+                );
+
+        -- Gate cyc with chip select from SoC
+        wb_sdcard_cyc <= wb_ext_io_in.cyc and wb_ext_is_sdcard;
+
+        wb_sdcard_adr <= x"0000" & wb_ext_io_in.adr(13 downto 0);
+
+        wb_sdcard_out.stall <= not wb_sdcard_out.ack;
+
+        -- Convert the SD core's non-pipelined DMA wishbone (wb_sddma_nr/_ir) to the
+        -- pipelined one seen by the SoC (wb_sddma_out/_in) by suppressing repeated strobes
+        process(system_clk)
+        begin
+            if rising_edge(system_clk) then
+                wb_sddma_out <= wb_sddma_nr;  -- registered copy of the core's request; stb may be overridden just below
+                if wb_sddma_stb_sent = '1' or
+                    (wb_sddma_out.stb = '1' and wb_sddma_in.stall = '0') then
+                    wb_sddma_out.stb <= '0';  -- strobe already sent, or being accepted this cycle: do not strobe again
+                end if;
+                if wb_sddma_nr.cyc = '0' or wb_sddma_ir.ack = '1' then
+                    wb_sddma_stb_sent <= '0';  -- cycle dropped or registered ack seen: allow the next strobe
+                elsif wb_sddma_in.stall = '0' then
+                    wb_sddma_stb_sent <= wb_sddma_nr.stb;  -- not stalled: latch whether the core is strobing
+                end if;
+                wb_sddma_ir <= wb_sddma_in;  -- registered response handed back to the SD core
+            end if;
+        end process;
+
+    end generate;
+
    -- Mux WB response on the IO bus
-    wb_ext_io_out <= wb_eth_out when wb_ext_is_eth = '1' else wb_dram_ctrl_out;
+    wb_ext_io_out <= wb_eth_out when wb_ext_is_eth = '1' else
+                     wb_sdcard_out when wb_ext_is_sdcard = '1' else
+                     wb_dram_ctrl_out;

    leds_pwm : process(system_clk)
    begin
@ -543,6 +699,72 @@ begin
    led4 <= system_clk_locked;
    led5 <= eth_clk_locked;
    led6 <= not soc_rst;
-    led7 <= not spi_flash_cs_n;
+
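+    -- GPIO: gpio 0-13 <-> shield_io 0-13, gpio 14-29 <-> shield_io 26-41, gpio 30-31 <-> shield_io 43-44 (indices hard-coded for 32 GPIOs)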
+    gpio_in(0) <= shield_io(0);
+    gpio_in(1) <= shield_io(1);
+    gpio_in(2) <= shield_io(2);
+    gpio_in(3) <= shield_io(3);
+    gpio_in(4) <= shield_io(4);
+    gpio_in(5) <= shield_io(5);
+    gpio_in(6) <= shield_io(6);
+    gpio_in(7) <= shield_io(7);
+    gpio_in(8) <= shield_io(8);
+    gpio_in(9) <= shield_io(9);
+    gpio_in(10) <= shield_io(10);
+    gpio_in(11) <= shield_io(11);
+    gpio_in(12) <= shield_io(12);
+    gpio_in(13) <= shield_io(13);
+    gpio_in(14) <= shield_io(26);
+    gpio_in(15) <= shield_io(27);
+    gpio_in(16) <= shield_io(28);
+    gpio_in(17) <= shield_io(29);
+    gpio_in(18) <= shield_io(30);
+    gpio_in(19) <= shield_io(31);
+    gpio_in(20) <= shield_io(32);
+    gpio_in(21) <= shield_io(33);
+    gpio_in(22) <= shield_io(34);
+    gpio_in(23) <= shield_io(35);
+    gpio_in(24) <= shield_io(36);
+    gpio_in(25) <= shield_io(37);
+    gpio_in(26) <= shield_io(38);
+    gpio_in(27) <= shield_io(39);
+    gpio_in(28) <= shield_io(40);
+    gpio_in(29) <= shield_io(41);
+    gpio_in(30) <= shield_io(43);
+    gpio_in(31) <= shield_io(44);
+
+    shield_io(0) <= gpio_out(0) when gpio_dir(0) = '1' else 'Z';
+    shield_io(1) <= gpio_out(1) when gpio_dir(1) = '1' else 'Z';
+    shield_io(2) <= gpio_out(2) when gpio_dir(2) = '1' else 'Z';
+    shield_io(3) <= gpio_out(3) when gpio_dir(3) = '1' else 'Z';
+    shield_io(4) <= gpio_out(4) when gpio_dir(4) = '1' else 'Z';
+    shield_io(5) <= gpio_out(5) when gpio_dir(5) = '1' else 'Z';
+    shield_io(6) <= gpio_out(6) when gpio_dir(6) = '1' else 'Z';
+    shield_io(7) <= gpio_out(7) when gpio_dir(7) = '1' else 'Z';
+    shield_io(8) <= gpio_out(8) when gpio_dir(8) = '1' else 'Z';
+    shield_io(9) <= gpio_out(9) when gpio_dir(9) = '1' else 'Z';
+    shield_io(10) <= gpio_out(10) when gpio_dir(10) = '1' else 'Z';
+    shield_io(11) <= gpio_out(11) when gpio_dir(11) = '1' else 'Z';
+    shield_io(12) <= gpio_out(12) when gpio_dir(12) = '1' else 'Z';
+    shield_io(13) <= gpio_out(13) when gpio_dir(13) = '1' else 'Z';
+    shield_io(26) <= gpio_out(14) when gpio_dir(14) = '1' else 'Z';
+    shield_io(27) <= gpio_out(15) when gpio_dir(15) = '1' else 'Z';
+    shield_io(28) <= gpio_out(16) when gpio_dir(16) = '1' else 'Z';
+    shield_io(29) <= gpio_out(17) when gpio_dir(17) = '1' else 'Z';
+    shield_io(30) <= gpio_out(18) when gpio_dir(18) = '1' else 'Z';
+    shield_io(31) <= gpio_out(19) when gpio_dir(19) = '1' else 'Z';
+    shield_io(32) <= gpio_out(20) when gpio_dir(20) = '1' else 'Z';
+    shield_io(33) <= gpio_out(21) when gpio_dir(21) = '1' else 'Z';
+    shield_io(34) <= gpio_out(22) when gpio_dir(22) = '1' else 'Z';
+    shield_io(35) <= gpio_out(23) when gpio_dir(23) = '1' else 'Z';
+    shield_io(36) <= gpio_out(24) when gpio_dir(24) = '1' else 'Z';
+    shield_io(37) <= gpio_out(25) when gpio_dir(25) = '1' else 'Z';
+    shield_io(38) <= gpio_out(26) when gpio_dir(26) = '1' else 'Z';
+    shield_io(39) <= gpio_out(27) when gpio_dir(27) = '1' else 'Z';
+    shield_io(40) <= gpio_out(28) when gpio_dir(28) = '1' else 'Z';
+    shield_io(41) <= gpio_out(29) when gpio_dir(29) = '1' else 'Z';
+    shield_io(43) <= gpio_out(30) when gpio_dir(30) = '1' else 'Z';
+    shield_io(44) <= gpio_out(31) when gpio_dir(31) = '1' else 'Z';

 end architecture behaviour;
--- a/fpga/top-generic.vhdl
+++ b/fpga/top-generic.vhdl
@ -12,10 +12,12 @@ entity toplevel is
 	CLK_INPUT     : positive := 100000000;
 	CLK_FREQUENCY : positive := 100000000;
        HAS_FPU       : boolean  := true;
+        HAS_BTC       : boolean  := false;
+        HAS_SHORT_MULT: boolean  := false;
+        ICACHE_NUM_LINES : natural := 64;
        LOG_LENGTH    : natural := 512;
 	DISABLE_FLATTEN_CORE : boolean := false;
-        UART_IS_16550 : boolean  := true;
-	HAS_JTAG      : boolean := true
+        UART_IS_16550 : boolean  := true
 	);
    port(
 	ext_clk   : in  std_ulogic;
@ -23,14 +25,7 @@ entity toplevel is

 	-- UART0 signals:
 	uart0_txd : out std_ulogic;
-	uart0_rxd : in  std_ulogic;
-
-	-- JTAG signals:
-	jtag_tck  : in std_ulogic;
-	jtag_tdi  : in std_ulogic;
-	jtag_tms  : in std_ulogic;
-	jtag_trst : in std_ulogic;
-	jtag_tdo  : out std_ulogic
+	uart0_rxd : in  std_ulogic
 	);
 end entity toplevel;

@ -79,21 +74,18 @@ begin
 	    SIM           => false,
 	    CLK_FREQ      => CLK_FREQUENCY,
            HAS_FPU       => HAS_FPU,
+            HAS_BTC       => HAS_BTC,
+            HAS_SHORT_MULT => HAS_SHORT_MULT,
+	    ICACHE_NUM_LINES => ICACHE_NUM_LINES,
            LOG_LENGTH    => LOG_LENGTH,
 	    DISABLE_FLATTEN_CORE => DISABLE_FLATTEN_CORE,
-            UART0_IS_16550     => UART_IS_16550,
-	    HAS_JTAG           => HAS_JTAG
+            UART0_IS_16550     => UART_IS_16550
 	    )
 	port map (
 	    system_clk        => system_clk,
 	    rst               => soc_rst,
 	    uart0_txd         => uart0_txd,
-	    uart0_rxd         => uart0_rxd,
-	    jtag_tck          => jtag_tck,
-	    jtag_tdi          => jtag_tdi,
-	    jtag_tms          => jtag_tms,
-	    jtag_trst         => jtag_trst,
-	    jtag_tdo          => jtag_tdo
+	    uart0_rxd         => uart0_rxd
 	    );

 end architecture behaviour;
--- a/fpga/top-genesys2.vhdl
+++ b/fpga/top-genesys2.vhdl
@ -97,6 +97,10 @@ architecture behaviour of toplevel is
    signal spi_sdat_oe : std_ulogic_vector(3 downto 0);
    signal spi_sdat_i  : std_ulogic_vector(3 downto 0);

+    -- ddram clock signals as vectors
+    signal ddram_clk_p_vec : std_logic_vector(0 downto 0);
+    signal ddram_clk_n_vec : std_logic_vector(0 downto 0);
+
    -- Fixup various memory sizes based on generics
    function get_bram_size return natural is
    begin
@ -270,11 +274,15 @@ begin
 		rst_out => open
 		);

+	ddram_clk_p_vec <= (others => ddram_clk_p);  -- NOTE(review): vector is assigned from the top-level port here and is also bound to the wrapper's ddram_clk_p below; confirm intended direction
+	ddram_clk_n_vec <= (others => ddram_clk_n);  -- same pattern for the negative clock line
+
 	dram: entity work.litedram_wrapper
 	    generic map(
 		DRAM_ABITS => 25,
 		DRAM_ALINES => 15,
                DRAM_DLINES => 32,
+                DRAM_CKLINES => 1,
                DRAM_PORT_WIDTH => 256,
                PAYLOAD_FILE => RAM_INIT_FILE,
                PAYLOAD_SIZE => PAYLOAD_SIZE
@ -307,8 +315,8 @@ begin
 		ddram_dq	=> ddram_dq,
 		ddram_dqs_p	=> ddram_dqs_p,
 		ddram_dqs_n	=> ddram_dqs_n,
-		ddram_clk_p	=> ddram_clk_p,
-		ddram_clk_n	=> ddram_clk_n,
+		ddram_clk_p	=> ddram_clk_p_vec,
+		ddram_clk_n	=> ddram_clk_n_vec,
 		ddram_cke	=> ddram_cke,
 		ddram_odt	=> ddram_odt,
 		ddram_reset_n	=> ddram_reset_n
--- a/fpga/top-nexys-video.vhdl
+++ b/fpga/top-nexys-video.vhdl
@ -15,6 +15,8 @@ entity toplevel is
 	RESET_LOW     : boolean  := true;
 	CLK_FREQUENCY : positive := 100000000;
        HAS_FPU       : boolean  := true;
+        HAS_BTC       : boolean  := true;
+        HAS_SHORT_MULT: boolean  := false;
 	USE_LITEDRAM  : boolean  := false;
 	NO_BRAM       : boolean  := false;
 	DISABLE_FLATTEN_CORE : boolean := false;
@ -22,19 +24,27 @@ entity toplevel is
        SPI_FLASH_DEF_CKDV : natural := 1;
        SPI_FLASH_DEF_QUAD : boolean := true;
        LOG_LENGTH         : natural := 2048;
-        UART_IS_16550      : boolean := true
+        UART_IS_16550      : boolean := true;
+        USE_LITEETH        : boolean := false;
+        USE_LITESDCARD     : boolean := false
 	);
    port(
 	ext_clk   : in  std_ulogic;
-	ext_rst   : in  std_ulogic;
+        ext_rst_n   : in  std_ulogic;

 	-- UART0 signals:
 	uart_main_tx : out std_ulogic;
 	uart_main_rx : in  std_ulogic;

-	-- LEDs
-	led0	: out std_logic;
-	led1	: out std_logic;
+        -- LEDs
+        led0 : out std_ulogic;
+        led1 : out std_ulogic;
+        led2 : out std_ulogic;
+        led3 : out std_ulogic;
+        led4 : out std_ulogic;
+        led5 : out std_ulogic;
+        led6 : out std_ulogic;
+        led7 : out std_ulogic;

        -- SPI
        spi_flash_cs_n   : out std_ulogic;
@ -43,6 +53,25 @@ entity toplevel is
        spi_flash_wp_n   : inout std_ulogic;
        spi_flash_hold_n : inout std_ulogic;

+        -- Ethernet
+        eth_clocks_tx    : out std_ulogic;
+        eth_clocks_rx    : in std_ulogic;
+        eth_rst_n        : out std_ulogic;
+        eth_int_n        : in std_ulogic;
+        eth_mdio         : inout std_ulogic;
+        eth_mdc          : out std_ulogic;
+        eth_rx_ctl       : in std_ulogic;
+        eth_rx_data      : in std_ulogic_vector(3 downto 0);
+        eth_tx_ctl       : out std_ulogic;
+        eth_tx_data      : out std_ulogic_vector(3 downto 0);
+
+        -- SD card
+        sdcard_data   : inout std_ulogic_vector(3 downto 0);
+        sdcard_cmd    : inout std_ulogic;
+        sdcard_clk    : out   std_ulogic;
+        sdcard_cd     : in    std_ulogic;
+        sdcard_reset  : out   std_ulogic;
+
 	-- DRAM wires
 	ddram_a       : out std_logic_vector(14 downto 0);
 	ddram_ba      : out std_logic_vector(2 downto 0);
@ -68,18 +97,37 @@ architecture behaviour of toplevel is
    signal pll_rst : std_ulogic;

    -- Internal clock signals:
-    signal system_clk : std_ulogic;
+    signal system_clk        : std_ulogic;
    signal system_clk_locked : std_ulogic;

+    -- External IOs from the SoC
+    signal wb_ext_io_in        : wb_io_master_out;
+    signal wb_ext_io_out       : wb_io_slave_out;
+    signal wb_ext_is_dram_csr  : std_ulogic;
+    signal wb_ext_is_dram_init : std_ulogic;
+    signal wb_ext_is_eth       : std_ulogic;
+    signal wb_ext_is_sdcard    : std_ulogic;
+
    -- DRAM main data wishbone connection
    signal wb_dram_in       : wishbone_master_out;
    signal wb_dram_out      : wishbone_slave_out;

    -- DRAM control wishbone connection
-    signal wb_ext_io_in        : wb_io_master_out;
-    signal wb_ext_io_out       : wb_io_slave_out;
-    signal wb_ext_is_dram_csr  : std_ulogic;
-    signal wb_ext_is_dram_init : std_ulogic;
+    signal wb_dram_ctrl_out    : wb_io_slave_out := wb_io_slave_out_init;
+
+    -- LiteEth connection
+    signal ext_irq_eth         : std_ulogic;
+    signal wb_eth_out          : wb_io_slave_out := wb_io_slave_out_init;
+
+    -- LiteSDCard connection
+    signal ext_irq_sdcard      : std_ulogic := '0';
+    signal wb_sdcard_out       : wb_io_slave_out := wb_io_slave_out_init;
+    signal wb_sddma_out        : wb_io_master_out := wb_io_master_out_init;
+    signal wb_sddma_in         : wb_io_slave_out;
+    signal wb_sddma_nr         : wb_io_master_out;
+    signal wb_sddma_ir         : wb_io_slave_out;
+    -- for conversion from non-pipelined wishbone to pipelined
+    signal wb_sddma_stb_sent   : std_ulogic;

    -- Control/status
    signal core_alt_reset : std_ulogic;
@ -91,6 +139,10 @@ architecture behaviour of toplevel is
    signal spi_sdat_oe : std_ulogic_vector(3 downto 0);
    signal spi_sdat_i  : std_ulogic_vector(3 downto 0);

+    -- ddram clock signals as vectors
+    signal ddram_clk_p_vec : std_logic_vector(0 downto 0);
+    signal ddram_clk_n_vec : std_logic_vector(0 downto 0);
+
    -- Fixup various memory sizes based on generics
    function get_bram_size return natural is
    begin
@ -122,6 +174,8 @@ begin
 	    SIM           => false,
 	    CLK_FREQ      => CLK_FREQUENCY,
            HAS_FPU       => HAS_FPU,
+            HAS_BTC       => HAS_BTC,
+            HAS_SHORT_MULT=> HAS_SHORT_MULT,
 	    HAS_DRAM      => USE_LITEDRAM,
 	    DRAM_SIZE     => 512 * 1024 * 1024,
            DRAM_INIT_SIZE => PAYLOAD_SIZE,
@ -132,7 +186,9 @@ begin
            SPI_FLASH_DEF_CKDV => SPI_FLASH_DEF_CKDV,
            SPI_FLASH_DEF_QUAD => SPI_FLASH_DEF_QUAD,
            LOG_LENGTH         => LOG_LENGTH,
-            UART0_IS_16550     => UART_IS_16550
+            UART0_IS_16550     => UART_IS_16550,
+            HAS_LITEETH        => USE_LITEETH,
+            HAS_SD_CARD        => USE_LITESDCARD
 	    )
 	port map (
            -- System signals
@ -150,13 +206,24 @@ begin
            spi_flash_sdat_oe => spi_sdat_oe,
            spi_flash_sdat_i  => spi_sdat_i,

-            -- DRAM wishbone
+            -- External interrupts
+            ext_irq_eth       => ext_irq_eth,
+            ext_irq_sdcard    => ext_irq_sdcard,
+
+            -- DRAM and IO wishbone
 	    wb_dram_in          => wb_dram_in,
 	    wb_dram_out         => wb_dram_out,
 	    wb_ext_io_in        => wb_ext_io_in,
 	    wb_ext_io_out       => wb_ext_io_out,
 	    wb_ext_is_dram_csr  => wb_ext_is_dram_csr,
 	    wb_ext_is_dram_init => wb_ext_is_dram_init,
+            wb_ext_is_eth       => wb_ext_is_eth,
+            wb_ext_is_sdcard    => wb_ext_is_sdcard,
+
+            -- DMA wishbone
+            wishbone_dma_in     => wb_sddma_in,
+            wishbone_dma_out    => wb_sddma_out,
+
 	    alt_reset           => core_alt_reset
 	    );

@ -196,8 +263,8 @@ begin
 	    port map(
 		ext_clk => ext_clk,
 		pll_clk => system_clk,
-		pll_locked_in => system_clk_locked,
-		ext_rst_in => ext_rst,
+                pll_locked_in => system_clk_locked,
+                ext_rst_in => ext_rst_n,
 		pll_rst_out => pll_rst,
 		rst_out => soc_rst
 		);
@ -216,6 +283,7 @@ begin

 	led0 <= '1';
 	led1 <= not soc_rst;
+        led2 <= '0';
 	core_alt_reset <= '0';

        -- Vivado barfs on those differential signals if left
@ -250,17 +318,31 @@ begin
 	    port map(
 		ext_clk => ext_clk,
 		pll_clk => system_clk,
-		pll_locked_in => '1',
-		ext_rst_in => ext_rst,
+                pll_locked_in => '1',
+                ext_rst_in => ext_rst_n,
 		pll_rst_out => pll_rst,
-		rst_out => open
+                rst_out => open
 		);

+        -- Generate SoC reset: asserted asynchronously while ext_rst_n is low, otherwise updated on system_clk
+        soc_rst_gen: process(system_clk, ext_rst_n)  -- ext_rst_n listed so the asynchronous branch re-evaluates in simulation
+        begin
+            if ext_rst_n = '0' then
+                soc_rst <= '1';  -- external reset (active low) forces SoC reset immediately
+            elsif rising_edge(system_clk) then
+                soc_rst <= dram_sys_rst or not system_clk_locked;  -- hold reset while DRAM asks for it or the clock is not locked
+            end if;
+        end process;
+
+	ddram_clk_p_vec <= (others => ddram_clk_p);  -- NOTE(review): vector is assigned from the top-level port here and is also bound to the wrapper's ddram_clk_p below; confirm intended direction
+	ddram_clk_n_vec <= (others => ddram_clk_n);  -- same pattern for the negative clock line
+
 	dram: entity work.litedram_wrapper
 	    generic map(
 		DRAM_ABITS => 25,
 		DRAM_ALINES => 15,
                DRAM_DLINES => 16,
+                DRAM_CKLINES => 1,
                DRAM_PORT_WIDTH => 128,
                PAYLOAD_FILE => RAM_INIT_FILE,
                PAYLOAD_SIZE => PAYLOAD_SIZE
@ -269,14 +351,14 @@ begin
 		clk_in		=> ext_clk,
 		rst             => pll_rst,
 		system_clk	=> system_clk,
-		system_reset	=> soc_rst,
+                system_reset	=> dram_sys_rst,
                core_alt_reset  => core_alt_reset,
 		pll_locked	=> system_clk_locked,

 		wb_in		=> wb_dram_in,
 		wb_out		=> wb_dram_out,
 		wb_ctrl_in	=> wb_ext_io_in,
-		wb_ctrl_out	=> wb_ext_io_out,
+                wb_ctrl_out	=> wb_dram_ctrl_out,
 		wb_ctrl_is_csr  => wb_ext_is_dram_csr,
 		wb_ctrl_is_init => wb_ext_is_dram_init,

@ -293,15 +375,213 @@ begin
 		ddram_dq	=> ddram_dq,
 		ddram_dqs_p	=> ddram_dqs_p,
 		ddram_dqs_n	=> ddram_dqs_n,
-		ddram_clk_p	=> ddram_clk_p,
-		ddram_clk_n	=> ddram_clk_n,
+		ddram_clk_p	=> ddram_clk_p_vec,
+		ddram_clk_n	=> ddram_clk_n_vec,
 		ddram_cke	=> ddram_cke,
 		ddram_odt	=> ddram_odt,
 		ddram_reset_n	=> ddram_reset_n
 		);

-	led0 <= dram_init_done and not dram_init_error;
+        led0 <= not dram_init_done;
 	led1 <= dram_init_error; -- Make it blink ?
+        led2 <= dram_init_done and not dram_init_error;
+
+    end generate;
+
+    has_liteeth : if USE_LITEETH generate
+
+        component liteeth_core port (
+            sys_clock           : in std_ulogic;
+            sys_reset           : in std_ulogic;
+            rgmii_eth_clocks_tx : out std_ulogic;
+            rgmii_eth_clocks_rx : in std_ulogic;
+            rgmii_eth_rst_n     : out std_ulogic;
+            rgmii_eth_int_n     : in std_ulogic;
+            rgmii_eth_mdio      : inout std_ulogic;
+            rgmii_eth_mdc       : out std_ulogic;
+            rgmii_eth_rx_ctl    : in std_ulogic;
+            rgmii_eth_rx_data   : in std_ulogic_vector(3 downto 0);
+            rgmii_eth_tx_ctl    : out std_ulogic;
+            rgmii_eth_tx_data   : out std_ulogic_vector(3 downto 0);
+            wishbone_adr        : in std_ulogic_vector(29 downto 0);
+            wishbone_dat_w      : in std_ulogic_vector(31 downto 0);
+            wishbone_dat_r      : out std_ulogic_vector(31 downto 0);
+            wishbone_sel        : in std_ulogic_vector(3 downto 0);
+            wishbone_cyc        : in std_ulogic;
+            wishbone_stb        : in std_ulogic;
+            wishbone_ack        : out std_ulogic;
+            wishbone_we         : in std_ulogic;
+            wishbone_cti        : in std_ulogic_vector(2 downto 0);
+            wishbone_bte        : in std_ulogic_vector(1 downto 0);
+            wishbone_err        : out std_ulogic;
+            interrupt           : out std_ulogic
+            );
+        end component;
+
+        signal wb_eth_cyc     : std_ulogic;
+        signal wb_eth_adr     : std_ulogic_vector(29 downto 0);
+
+    begin
+        liteeth :  liteeth_core
+            port map(
+                sys_clock           => system_clk,
+                sys_reset           => soc_rst,
+                rgmii_eth_clocks_tx => eth_clocks_tx,
+                rgmii_eth_clocks_rx => eth_clocks_rx,
+                rgmii_eth_rst_n     => eth_rst_n,
+                rgmii_eth_int_n     => eth_int_n,
+                rgmii_eth_mdio      => eth_mdio,
+                rgmii_eth_mdc       => eth_mdc,
+                rgmii_eth_rx_ctl    => eth_rx_ctl,
+                rgmii_eth_rx_data   => eth_rx_data,
+                rgmii_eth_tx_ctl    => eth_tx_ctl,
+                rgmii_eth_tx_data   => eth_tx_data,
+                wishbone_adr        => wb_eth_adr,
+                wishbone_dat_w      => wb_ext_io_in.dat,
+                wishbone_dat_r      => wb_eth_out.dat,
+                wishbone_sel        => wb_ext_io_in.sel,
+                wishbone_cyc        => wb_eth_cyc,
+                wishbone_stb        => wb_ext_io_in.stb,
+                wishbone_ack        => wb_eth_out.ack,
+                wishbone_we         => wb_ext_io_in.we,
+                wishbone_cti        => "000",
+                wishbone_bte        => "00",
+                wishbone_err        => open,
+                interrupt           => ext_irq_eth
+                );
+
+        -- Gate cyc with "chip select" from soc
+        wb_eth_cyc <= wb_ext_io_in.cyc and wb_ext_is_eth;
+
+        -- Remove top address bits as liteeth decoder doesn't know about them
+        wb_eth_adr <= x"000" & "000" & wb_ext_io_in.adr(14 downto 0);
+
+        -- LiteETH isn't pipelined
+        wb_eth_out.stall <= not wb_eth_out.ack;
+
+    end generate;
+
+    no_liteeth : if not USE_LITEETH generate
+        ext_irq_eth    <= '0';
+    end generate;
+
+    -- SD card
+    has_sdcard : if USE_LITESDCARD generate
+        component litesdcard_core port (
+            clk           : in    std_ulogic;
+            rst           : in    std_ulogic;
+            -- wishbone for accessing control registers
+            wb_ctrl_adr   : in    std_ulogic_vector(29 downto 0);
+            wb_ctrl_dat_w : in    std_ulogic_vector(31 downto 0);
+            wb_ctrl_dat_r : out   std_ulogic_vector(31 downto 0);
+            wb_ctrl_sel   : in    std_ulogic_vector(3 downto 0);
+            wb_ctrl_cyc   : in    std_ulogic;
+            wb_ctrl_stb   : in    std_ulogic;
+            wb_ctrl_ack   : out   std_ulogic;
+            wb_ctrl_we    : in    std_ulogic;
+            wb_ctrl_cti   : in    std_ulogic_vector(2 downto 0);
+            wb_ctrl_bte   : in    std_ulogic_vector(1 downto 0);
+            wb_ctrl_err   : out   std_ulogic;
+            -- wishbone for SD card core to use for DMA
+            wb_dma_adr    : out   std_ulogic_vector(29 downto 0);
+            wb_dma_dat_w  : out   std_ulogic_vector(31 downto 0);
+            wb_dma_dat_r  : in    std_ulogic_vector(31 downto 0);
+            wb_dma_sel    : out   std_ulogic_vector(3 downto 0);
+            wb_dma_cyc    : out   std_ulogic;
+            wb_dma_stb    : out   std_ulogic;
+            wb_dma_ack    : in    std_ulogic;
+            wb_dma_we     : out   std_ulogic;
+            wb_dma_cti    : out   std_ulogic_vector(2 downto 0);
+            wb_dma_bte    : out   std_ulogic_vector(1 downto 0);
+            wb_dma_err    : in    std_ulogic;
+            -- connections to SD card
+            sdcard_data   : inout std_ulogic_vector(3 downto 0);
+            sdcard_cmd    : inout std_ulogic;
+            sdcard_clk    : out   std_ulogic;
+            sdcard_cd     : in    std_ulogic;
+            irq           : out   std_ulogic
+            );
+        end component;
+
+        signal wb_sdcard_cyc : std_ulogic;
+        signal wb_sdcard_adr : std_ulogic_vector(29 downto 0);
+
+    begin
+        litesdcard : litesdcard_core
+            port map (
+                clk           => system_clk,
+                rst           => soc_rst,
+                wb_ctrl_adr   => wb_sdcard_adr,
+                wb_ctrl_dat_w => wb_ext_io_in.dat,
+                wb_ctrl_dat_r => wb_sdcard_out.dat,
+                wb_ctrl_sel   => wb_ext_io_in.sel,
+                wb_ctrl_cyc   => wb_sdcard_cyc,
+                wb_ctrl_stb   => wb_ext_io_in.stb,
+                wb_ctrl_ack   => wb_sdcard_out.ack,
+                wb_ctrl_we    => wb_ext_io_in.we,
+                wb_ctrl_cti   => "000",
+                wb_ctrl_bte   => "00",
+                wb_ctrl_err   => open,
+                wb_dma_adr    => wb_sddma_nr.adr,
+                wb_dma_dat_w  => wb_sddma_nr.dat,
+                wb_dma_dat_r  => wb_sddma_ir.dat,
+                wb_dma_sel    => wb_sddma_nr.sel,
+                wb_dma_cyc    => wb_sddma_nr.cyc,
+                wb_dma_stb    => wb_sddma_nr.stb,
+                wb_dma_ack    => wb_sddma_ir.ack,
+                wb_dma_we     => wb_sddma_nr.we,
+                wb_dma_cti    => open,
+                wb_dma_bte    => open,
+                wb_dma_err    => '0',
+                sdcard_data   => sdcard_data,
+                sdcard_cmd    => sdcard_cmd,
+                sdcard_clk    => sdcard_clk,
+                sdcard_cd     => sdcard_cd,
+                irq           => ext_irq_sdcard
+                );
+
+        -- Gate cyc with chip select from SoC
+        wb_sdcard_cyc <= wb_ext_io_in.cyc and wb_ext_is_sdcard;
+
+        wb_sdcard_adr <= x"0000" & wb_ext_io_in.adr(13 downto 0);
+
+        wb_sdcard_out.stall <= not wb_sdcard_out.ack;
+
+	sdcard_reset <= '0';
+
+        -- Convert the SD core's non-pipelined DMA wishbone (wb_sddma_nr/_ir) to the
+        -- pipelined one seen by the SoC (wb_sddma_out/_in) by suppressing repeated strobes
+        process(system_clk)
+        begin
+            if rising_edge(system_clk) then
+                wb_sddma_out <= wb_sddma_nr;  -- registered copy of the core's request; stb may be overridden just below
+                if wb_sddma_stb_sent = '1' or
+                    (wb_sddma_out.stb = '1' and wb_sddma_in.stall = '0') then
+                    wb_sddma_out.stb <= '0';  -- strobe already sent, or being accepted this cycle: do not strobe again
+                end if;
+                if wb_sddma_nr.cyc = '0' or wb_sddma_ir.ack = '1' then
+                    wb_sddma_stb_sent <= '0';  -- cycle dropped or registered ack seen: allow the next strobe
+                elsif wb_sddma_in.stall = '0' then
+                    wb_sddma_stb_sent <= wb_sddma_nr.stb;  -- not stalled: latch whether the core is strobing
+                end if;
+                wb_sddma_ir <= wb_sddma_in;  -- registered response handed back to the SD core
+            end if;
+        end process;

    end generate;
+
+    no_sdcard : if not USE_LITESDCARD generate
+        sdcard_reset <= '1';
+    end generate;
+
+    -- Mux WB response on the IO bus
+    wb_ext_io_out <= wb_eth_out when wb_ext_is_eth = '1' else
+                     wb_sdcard_out when wb_ext_is_sdcard = '1' else
+                     wb_dram_ctrl_out;
+
+    led4 <= system_clk_locked;  -- NOTE(review): led3 is added as an out port in this change but no assignment to it appears; confirm
+    led5 <= '1';  -- constant
+    led6 <= not soc_rst;  -- high when the SoC is out of reset
+    led7 <= '0';  -- constant
+
 end architecture behaviour;
--- a/fpga/top-orangecrab0.2.vhdl
+++ b/fpga/top-orangecrab0.2.vhdl
@ -0,0 +1,512 @@
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+library work;
+use work.wishbone_types.all;
+
+-- Top level for the OrangeCrab 0.2 board.  Instantiates work.soc and,
+-- depending on the generics, the LiteDRAM wrapper and the LiteSDCard core.
+entity toplevel is
+    generic (
+        -- MEMORY_SIZE is used as the block RAM size, or as the DRAM init
+        -- payload size when both USE_LITEDRAM and NO_BRAM are true
+        MEMORY_SIZE        : integer  := 16384;
+        RAM_INIT_FILE      : string   := "firmware.hex";
+        RESET_LOW          : boolean  := true;
+        -- CLK_INPUT is only used by the clock generator of the non-DRAM build
+        CLK_INPUT          : positive := 100000000;
+        -- NOTE: with USE_LITEDRAM = true the architecture asserts
+        -- CLK_FREQUENCY = 48000000, so this default has to be overridden
+        -- for a DRAM build
+        CLK_FREQUENCY      : positive := 100000000;
+        HAS_FPU            : boolean  := true;
+        HAS_BTC            : boolean  := false;
+        USE_LITEDRAM       : boolean  := true;
+        NO_BRAM            : boolean  := true;
+        -- SCLK_STARTUPE2 is not referenced by this architecture
+        SCLK_STARTUPE2     : boolean := false;
+        SPI_FLASH_OFFSET   : integer := 4194304;
+        SPI_FLASH_DEF_CKDV : natural := 1;
+        SPI_FLASH_DEF_QUAD : boolean := true;
+        LOG_LENGTH         : natural := 0;
+        UART_IS_16550      : boolean  := true;
+        HAS_UART1          : boolean  := false;
+        USE_LITESDCARD     : boolean := true;
+        ICACHE_NUM_LINES   : natural := 64;
+        NGPIO              : natural := 0
+        );
+    port(
+        ext_clk   : in  std_ulogic;
+        ext_rst_n : in  std_ulogic;
+
+        -- UART0 signals:
+        -- pin_gpio_0 carries UART0 transmit, pin_gpio_1 UART0 receive
+        pin_gpio_0 : out std_ulogic;
+        pin_gpio_1 : in  std_ulogic;
+
+        -- LEDs
+        -- (driven through the leds_pwm process, which dims them)
+        led0_b  : out std_ulogic;
+        led0_g  : out std_ulogic;
+        led0_r  : out std_ulogic;
+
+        -- SPI
+        -- The four data lines are bidirectional; there is no clock port
+        -- here because the SPI clock leaves through the USRMCLK instance
+        spi_flash_cs_n   : out std_ulogic;
+        spi_flash_mosi   : inout std_ulogic;
+        spi_flash_miso   : inout std_ulogic;
+        spi_flash_wp_n   : inout std_ulogic;
+        spi_flash_hold_n : inout std_ulogic;
+
+        -- SD card wires
+        sdcard_data   : inout std_ulogic_vector(3 downto 0);
+        sdcard_cmd    : inout std_ulogic;
+        sdcard_clk    : out   std_ulogic;
+        sdcard_cd     : in    std_ulogic;
+
+        -- DRAM wires
+        ddram_a       : out std_ulogic_vector(13 downto 0);
+        ddram_ba      : out std_ulogic_vector(2 downto 0);
+        ddram_ras_n   : out std_ulogic;
+        ddram_cas_n   : out std_ulogic;
+        ddram_we_n    : out std_ulogic;
+        ddram_cs_n    : out std_ulogic;
+        ddram_dm      : out std_ulogic_vector(1 downto 0);
+        ddram_dq      : inout std_ulogic_vector(15 downto 0);
+        ddram_dqs_p   : inout std_ulogic_vector(1 downto 0);
+        ddram_clk_p   : out std_ulogic_vector(0 downto 0);
+        -- only the positive differential pin is instantiated
+        --ddram_dqs_n   : inout std_ulogic_vector(1 downto 0);
+        --ddram_clk_n   : out std_ulogic_vector(0 downto 0);
+        ddram_cke     : out std_ulogic;
+        ddram_odt     : out std_ulogic;
+        ddram_reset_n : out std_ulogic;
+
+        -- Constant-driven pins: in the DRAM build ddram_gnd is held at all
+        -- zeros and ddram_vccio at all ones
+        ddram_gnd      : out std_ulogic_vector(1 downto 0);
+        ddram_vccio    : out std_ulogic_vector(5 downto 0)
+        );
+end entity toplevel;
+
+architecture behaviour of toplevel is
+
+    -- Reset signals:
+    signal soc_rst : std_ulogic;
+    signal pll_rst : std_ulogic;
+
+    -- Internal clock signals:
+    signal system_clk        : std_ulogic;
+    signal system_clk_locked : std_ulogic;
+
+    -- External IOs from the SoC
+    signal wb_ext_io_in        : wb_io_master_out;
+    signal wb_ext_io_out       : wb_io_slave_out;
+    signal wb_ext_is_dram_csr  : std_ulogic;
+    signal wb_ext_is_dram_init : std_ulogic;
+    signal wb_ext_is_sdcard    : std_ulogic;
+
+    -- DRAM main data wishbone connection
+    signal wb_dram_in          : wishbone_master_out;
+    signal wb_dram_out         : wishbone_slave_out;
+
+    -- DRAM control wishbone connection
+    signal wb_dram_ctrl_out    : wb_io_slave_out := wb_io_slave_out_init;
+
+    -- LiteSDCard connection
+    signal ext_irq_sdcard      : std_ulogic := '0';
+    signal wb_sdcard_out       : wb_io_slave_out := wb_io_slave_out_init;
+    signal wb_sddma_out        : wb_io_master_out := wb_io_master_out_init;
+    signal wb_sddma_in         : wb_io_slave_out;
+    signal wb_sddma_nr         : wb_io_master_out;
+    signal wb_sddma_ir         : wb_io_slave_out;
+    -- for conversion from non-pipelined wishbone to pipelined
+    signal wb_sddma_stb_sent   : std_ulogic;
+
+    -- Control/status
+    signal core_alt_reset : std_ulogic;
+
+    -- Status LED
+    signal led0_b_pwm : std_ulogic;
+    signal led0_r_pwm : std_ulogic;
+    signal led0_g_pwm : std_ulogic;
+
+    -- Dumb PWM for the LEDs, those RGB LEDs are too bright otherwise
+    signal pwm_counter  : std_ulogic_vector(8 downto 0);
+
+    -- SPI flash
+    signal spi_sck     : std_ulogic;
+    signal spi_cs_n    : std_ulogic;
+    signal spi_sdat_o  : std_ulogic_vector(3 downto 0);
+    signal spi_sdat_oe : std_ulogic_vector(3 downto 0);
+    signal spi_sdat_i  : std_ulogic_vector(3 downto 0);
+
+    -- GPIO
+    signal gpio_in     : std_ulogic_vector(NGPIO - 1 downto 0);
+    signal gpio_out    : std_ulogic_vector(NGPIO - 1 downto 0);
+    signal gpio_dir    : std_ulogic_vector(NGPIO - 1 downto 0);
+
+    -- Fixup various memory sizes based on generics
+    -- Block RAM size passed to the SoC: zero when the design runs purely
+    -- from LiteDRAM (USE_LITEDRAM and NO_BRAM), MEMORY_SIZE otherwise.
+    function get_bram_size return natural is
+        variable size : integer := 0;
+    begin
+        if not (USE_LITEDRAM and NO_BRAM) then
+            size := MEMORY_SIZE;
+        end if;
+        return size;
+    end function;
+
+    -- Size of the payload preloaded through the DRAM init path: MEMORY_SIZE
+    -- when the design runs purely from LiteDRAM (USE_LITEDRAM and NO_BRAM),
+    -- zero otherwise.
+    function get_payload_size return natural is
+        variable size : integer := 0;
+    begin
+        if USE_LITEDRAM and NO_BRAM then
+            size := MEMORY_SIZE;
+        end if;
+        return size;
+    end function;
+
+    constant BRAM_SIZE    : natural := get_bram_size;
+    constant PAYLOAD_SIZE : natural := get_payload_size;
+
+    -- Declaration of the USRMCLK primitive used to output the SPI flash
+    -- clock.  It has inputs only, hence the syn_noprune attribute asking
+    -- the tools not to remove it.
+    component USRMCLK
+        port (
+            USRMCLKI  : in std_ulogic;
+            USRMCLKTS : in std_ulogic
+        );
+    end component;
+    attribute syn_noprune : boolean;
+    attribute syn_noprune of USRMCLK : component is true;
+
+begin
+
+    -- Main SoC
+    -- Memory sizing comes from the derived BRAM_SIZE / PAYLOAD_SIZE constants
+    -- rather than directly from MEMORY_SIZE; the SPI flash is always enabled
+    -- with four data lines and the short multiplier is always selected.
+    soc0: entity work.soc
+        generic map(
+            MEMORY_SIZE        => BRAM_SIZE,
+            RAM_INIT_FILE      => RAM_INIT_FILE,
+            SIM                => false,
+            CLK_FREQ           => CLK_FREQUENCY,
+            HAS_FPU            => HAS_FPU,
+            HAS_BTC            => HAS_BTC,
+            HAS_DRAM           => USE_LITEDRAM,
+            DRAM_SIZE          => 256 * 1024 * 1024,
+            DRAM_INIT_SIZE     => PAYLOAD_SIZE,
+            HAS_SPI_FLASH      => true,
+            SPI_FLASH_DLINES   => 4,
+            SPI_FLASH_OFFSET   => SPI_FLASH_OFFSET,
+            SPI_FLASH_DEF_CKDV => SPI_FLASH_DEF_CKDV,
+            SPI_FLASH_DEF_QUAD => SPI_FLASH_DEF_QUAD,
+            LOG_LENGTH         => LOG_LENGTH,
+            UART0_IS_16550     => UART_IS_16550,
+            HAS_UART1          => HAS_UART1,
+            HAS_SD_CARD        => USE_LITESDCARD,
+            ICACHE_NUM_LINES   => ICACHE_NUM_LINES,
+            HAS_SHORT_MULT     => true,
+            NGPIO              => NGPIO
+            )
+        port map (
+            -- System signals
+            system_clk        => system_clk,
+            rst               => soc_rst,
+
+            -- UART signals
+            uart0_txd         => pin_gpio_0,
+            uart0_rxd         => pin_gpio_1,
+
+            -- UART1 signals (left unconnected on this board)
+            --uart1_txd         => uart_pmod_tx,
+            --uart1_rxd         => uart_pmod_rx,
+
+            -- SPI signals
+            spi_flash_sck     => spi_sck,
+            spi_flash_cs_n    => spi_cs_n,
+            spi_flash_sdat_o  => spi_sdat_o,
+            spi_flash_sdat_oe => spi_sdat_oe,
+            spi_flash_sdat_i  => spi_sdat_i,
+
+            -- GPIO signals
+            gpio_in           => gpio_in,
+            gpio_out          => gpio_out,
+            gpio_dir          => gpio_dir,
+
+            -- External interrupts
+            ext_irq_sdcard    => ext_irq_sdcard,
+
+            -- DRAM wishbone
+            wb_dram_in           => wb_dram_in,
+            wb_dram_out          => wb_dram_out,
+
+            -- IO wishbone
+            wb_ext_io_in         => wb_ext_io_in,
+            wb_ext_io_out        => wb_ext_io_out,
+            wb_ext_is_dram_csr   => wb_ext_is_dram_csr,
+            wb_ext_is_dram_init  => wb_ext_is_dram_init,
+            wb_ext_is_sdcard     => wb_ext_is_sdcard,
+
+            -- DMA wishbone
+            -- (wb_sddma_out is the pipelined request produced by the SD card
+            -- DMA conversion process; it keeps its init value without it)
+            wishbone_dma_in      => wb_sddma_in,
+            wishbone_dma_out     => wb_sddma_out,
+
+            alt_reset            => core_alt_reset
+            );
+
+    -- SPI Flash
+    --
+    -- Each of the four data pins is a tristate: it drives the SoC's output
+    -- bit while the matching output-enable bit is '1' and is released ('Z')
+    -- otherwise.  The pin level is always fed back as the input bit.
+    -- Bit mapping: 0 = mosi, 1 = miso, 2 = wp_n, 3 = hold_n.
+    spi_flash_cs_n   <= spi_cs_n;
+    spi_flash_mosi   <= spi_sdat_o(0) when spi_sdat_oe(0) = '1' else 'Z';
+    spi_flash_miso   <= spi_sdat_o(1) when spi_sdat_oe(1) = '1' else 'Z';
+    spi_flash_wp_n   <= spi_sdat_o(2) when spi_sdat_oe(2) = '1' else 'Z';
+    spi_flash_hold_n <= spi_sdat_o(3) when spi_sdat_oe(3) = '1' else 'Z';
+    spi_sdat_i(0)    <= spi_flash_mosi;
+    spi_sdat_i(1)    <= spi_flash_miso;
+    spi_sdat_i(2)    <= spi_flash_wp_n;
+    spi_sdat_i(3)    <= spi_flash_hold_n;
+
+    -- The SPI clock has no top-level port; it is handed to the USRMCLK
+    -- primitive instead, with its tristate input tied to '0'
+    uclk: USRMCLK port map (
+        USRMCLKI => spi_sck,
+        USRMCLKTS => '0'
+        );
+
+    -- Build without LiteDRAM: the system clock comes from the local clock
+    -- generator and the SoC reset directly from the reset controller.
+    -- (The unused ddram_clk_dummy signal, a leftover from boards that need a
+    -- dummy differential DRAM clock buffer, has been dropped.)
+    nodram: if not USE_LITEDRAM generate
+        -- Sequences the PLL reset and the SoC reset from the external reset
+        -- pin and the PLL lock indication
+        reset_controller: entity work.soc_reset
+            generic map(
+                RESET_LOW => RESET_LOW
+                )
+            port map(
+                ext_clk => ext_clk,
+                pll_clk => system_clk,
+                pll_locked_in => system_clk_locked,
+                ext_rst_in => ext_rst_n,
+                pll_rst_out => pll_rst,
+                rst_out => soc_rst
+                );
+
+        -- Derives system_clk (CLK_FREQUENCY) from ext_clk (CLK_INPUT)
+        clkgen: entity work.clock_generator
+            generic map(
+                CLK_INPUT_HZ => CLK_INPUT,
+                CLK_OUTPUT_HZ => CLK_FREQUENCY
+                )
+            port map(
+                ext_clk => ext_clk,
+                pll_rst_in => pll_rst,
+                pll_clk_out => system_clk,
+                pll_locked_out => system_clk_locked
+                );
+
+        -- Fixed LED request for this build: blue and red asserted, green not
+        led0_b_pwm <= '1';
+        led0_r_pwm <= '1';
+        led0_g_pwm <= '0';
+        -- There is no DRAM init code to boot from, so never use the
+        -- alternate reset
+        core_alt_reset <= '0';
+
+    end generate;
+
+    -- Build with LiteDRAM: the LiteDRAM wrapper produces the system clock,
+    -- the PLL lock flag and a system reset; the SoC reset is derived from
+    -- those in soc_rst_gen.
+    has_dram: if USE_LITEDRAM generate
+        signal dram_init_done  : std_ulogic;
+        signal dram_init_error : std_ulogic;
+        signal dram_sys_rst    : std_ulogic;
+        signal rst_gen_rst     : std_ulogic;
+    begin
+
+        -- Eventually dig out the frequency from
+        -- litesdram generate.py sys_clk_freq
+        -- but for now, assert it's 48Mhz for orangecrab
+        assert CLK_FREQUENCY = 48000000
+            report "CLK_FREQUENCY must be 48000000 when USE_LITEDRAM is true";
+
+        -- Only the PLL reset output is used here; rst_gen_rst is left
+        -- unconnected to anything else and soc_rst is generated below
+        reset_controller: entity work.soc_reset
+            generic map(
+                RESET_LOW => RESET_LOW,
+                PLL_RESET_BITS => 18,
+                SOC_RESET_BITS => 1
+                )
+            port map(
+                ext_clk => ext_clk,
+                pll_clk => system_clk,
+                pll_locked_in => system_clk_locked,
+                ext_rst_in => ext_rst_n,
+                pll_rst_out => pll_rst,
+                rst_out => rst_gen_rst
+                );
+
+        -- Generate SoC reset
+        -- ext_rst_n acts as an asynchronous reset, so it must be in the
+        -- sensitivity list for simulation to match the synthesised flop.
+        -- Otherwise the SoC stays in reset while the DRAM wrapper asserts
+        -- its system reset or the PLL has not locked.
+        soc_rst_gen: process(system_clk, ext_rst_n)
+        begin
+            if ext_rst_n = '0' then
+                soc_rst <= '1';
+            elsif rising_edge(system_clk) then
+                soc_rst <= dram_sys_rst or not system_clk_locked;
+            end if;
+        end process;
+
+        dram: entity work.litedram_wrapper
+            generic map(
+                DRAM_ABITS => 24,
+                DRAM_ALINES => 14,
+                DRAM_DLINES => 16,
+                DRAM_CKLINES => 1,
+                DRAM_PORT_WIDTH => 128,
+                NUM_LINES => 8, -- reduce from default of 64 to make smaller/timing
+                PAYLOAD_FILE => RAM_INIT_FILE,
+                PAYLOAD_SIZE => PAYLOAD_SIZE
+                )
+            port map(
+                clk_in          => ext_clk,
+                rst             => pll_rst,
+                system_clk      => system_clk,
+                system_reset    => dram_sys_rst,
+                core_alt_reset  => core_alt_reset,
+                pll_locked      => system_clk_locked,
+
+                wb_in           => wb_dram_in,
+                wb_out          => wb_dram_out,
+                wb_ctrl_in      => wb_ext_io_in,
+                wb_ctrl_out     => wb_dram_ctrl_out,
+                wb_ctrl_is_csr  => wb_ext_is_dram_csr,
+                wb_ctrl_is_init => wb_ext_is_dram_init,
+
+                init_done       => dram_init_done,
+                init_error      => dram_init_error,
+
+                ddram_a         => ddram_a,
+                ddram_ba        => ddram_ba,
+                ddram_ras_n     => ddram_ras_n,
+                ddram_cas_n     => ddram_cas_n,
+                ddram_we_n      => ddram_we_n,
+                ddram_cs_n      => ddram_cs_n,
+                ddram_dm        => ddram_dm,
+                ddram_dq        => ddram_dq,
+                ddram_dqs_p     => ddram_dqs_p,
+                ddram_clk_p     => ddram_clk_p,
+                -- only the positive differential pin is instantiated
+                --ddram_dqs_n     => ddram_dqs_n,
+                --ddram_clk_n     => ddram_clk_n,
+                ddram_cke       => ddram_cke,
+                ddram_odt       => ddram_odt,
+
+                ddram_reset_n   => ddram_reset_n
+                );
+
+        ddram_gnd <= "00";
+        -- for power consumption.
+        -- https://github.com/orangecrab-fpga/orangecrab-hardware/issues/19#issuecomment-683479378
+        ddram_vccio <= "111111";
+
+        -- LED request: blue until DRAM init is done, red on an init error,
+        -- green once init has finished without error
+        led0_b_pwm <= not dram_init_done;
+        led0_r_pwm <= dram_init_error;
+        led0_g_pwm <= dram_init_done and not dram_init_error;
+
+    end generate;
+
+
+    -- SD card pmod
+    -- Instantiates the LiteSDCard core, attaches its control port to the
+    -- SoC's external IO wishbone and its DMA port to the SoC's DMA wishbone.
+    has_sdcard : if USE_LITESDCARD generate
+        -- Component declaration for the litesdcard_core instantiated below
+        component litesdcard_core port (
+            clk           : in    std_ulogic;
+            rst           : in    std_ulogic;
+            -- wishbone for accessing control registers
+            wb_ctrl_adr   : in    std_ulogic_vector(29 downto 0);
+            wb_ctrl_dat_w : in    std_ulogic_vector(31 downto 0);
+            wb_ctrl_dat_r : out   std_ulogic_vector(31 downto 0);
+            wb_ctrl_sel   : in    std_ulogic_vector(3 downto 0);
+            wb_ctrl_cyc   : in    std_ulogic;
+            wb_ctrl_stb   : in    std_ulogic;
+            wb_ctrl_ack   : out   std_ulogic;
+            wb_ctrl_we    : in    std_ulogic;
+            wb_ctrl_cti   : in    std_ulogic_vector(2 downto 0);
+            wb_ctrl_bte   : in    std_ulogic_vector(1 downto 0);
+            wb_ctrl_err   : out   std_ulogic;
+            -- wishbone for SD card core to use for DMA
+            wb_dma_adr    : out   std_ulogic_vector(29 downto 0);
+            wb_dma_dat_w  : out   std_ulogic_vector(31 downto 0);
+            wb_dma_dat_r  : in    std_ulogic_vector(31 downto 0);
+            wb_dma_sel    : out   std_ulogic_vector(3 downto 0);
+            wb_dma_cyc    : out   std_ulogic;
+            wb_dma_stb    : out   std_ulogic;
+            wb_dma_ack    : in    std_ulogic;
+            wb_dma_we     : out   std_ulogic;
+            wb_dma_cti    : out   std_ulogic_vector(2 downto 0);
+            wb_dma_bte    : out   std_ulogic_vector(1 downto 0);
+            wb_dma_err    : in    std_ulogic;
+            -- connections to SD card
+            sdcard_data   : inout std_ulogic_vector(3 downto 0);
+            sdcard_cmd    : inout std_ulogic;
+            sdcard_clk    : out   std_ulogic;
+            sdcard_cd     : in    std_ulogic;
+            irq           : out   std_ulogic
+            );
+        end component;
+
+        -- Gated cycle and widened address for the core's control port
+        signal wb_sdcard_cyc : std_ulogic;
+        signal wb_sdcard_adr : std_ulogic_vector(29 downto 0);
+
+    begin
+        -- The control port uses constant cti/bte and ignores err; the DMA
+        -- port goes through wb_sddma_nr / wb_sddma_ir, which the process
+        -- below registers to and from the SoC's DMA wishbone
+        litesdcard : litesdcard_core
+            port map (
+                clk           => system_clk,
+                rst           => soc_rst,
+                wb_ctrl_adr   => wb_sdcard_adr,
+                wb_ctrl_dat_w => wb_ext_io_in.dat,
+                wb_ctrl_dat_r => wb_sdcard_out.dat,
+                wb_ctrl_sel   => wb_ext_io_in.sel,
+                wb_ctrl_cyc   => wb_sdcard_cyc,
+                wb_ctrl_stb   => wb_ext_io_in.stb,
+                wb_ctrl_ack   => wb_sdcard_out.ack,
+                wb_ctrl_we    => wb_ext_io_in.we,
+                wb_ctrl_cti   => "000",
+                wb_ctrl_bte   => "00",
+                wb_ctrl_err   => open,
+                wb_dma_adr    => wb_sddma_nr.adr,
+                wb_dma_dat_w  => wb_sddma_nr.dat,
+                wb_dma_dat_r  => wb_sddma_ir.dat,
+                wb_dma_sel    => wb_sddma_nr.sel,
+                wb_dma_cyc    => wb_sddma_nr.cyc,
+                wb_dma_stb    => wb_sddma_nr.stb,
+                wb_dma_ack    => wb_sddma_ir.ack,
+                wb_dma_we     => wb_sddma_nr.we,
+                wb_dma_cti    => open,
+                wb_dma_bte    => open,
+                wb_dma_err    => '0',
+                sdcard_data   => sdcard_data,
+                sdcard_cmd    => sdcard_cmd,
+                sdcard_clk    => sdcard_clk,
+                sdcard_cd     => sdcard_cd,
+                irq           => ext_irq_sdcard
+                );
+
+        -- Gate cyc with chip select from SoC
+        wb_sdcard_cyc <= wb_ext_io_in.cyc and wb_ext_is_sdcard;
+
+        -- Only the low 14 address bits are forwarded, zero-extended to the
+        -- core's 30-bit control address
+        wb_sdcard_adr <= x"0000" & wb_ext_io_in.adr(13 downto 0);
+
+        -- Stall is the inverse of ack on the control port response
+        wb_sdcard_out.stall <= not wb_sdcard_out.ack;
+
+        -- Convert non-pipelined DMA wishbone to pipelined by suppressing
+        -- non-acknowledged strobes
+        process(system_clk)
+        begin
+            if rising_edge(system_clk) then
+                -- Default: forward the core's request unchanged ...
+                wb_sddma_out <= wb_sddma_nr;
+                -- ... but this later assignment overrides stb, forcing it low
+                -- when the strobe was already sent or is being accepted now
+                if wb_sddma_stb_sent = '1' or
+                    (wb_sddma_out.stb = '1' and wb_sddma_in.stall = '0') then
+                    wb_sddma_out.stb <= '0';
+                end if;
+                -- Forget the "sent" state when the cycle ends or the
+                -- registered ack has arrived; otherwise, while the slave is
+                -- not stalling, track the core's strobe
+                if wb_sddma_nr.cyc = '0' or wb_sddma_ir.ack = '1' then
+                    wb_sddma_stb_sent <= '0';
+                elsif wb_sddma_in.stall = '0' then
+                    wb_sddma_stb_sent <= wb_sddma_nr.stb;
+                end if;
+                -- Register the slave response on its way back to the core
+                wb_sddma_ir <= wb_sddma_in;
+            end if;
+        end process;
+
+    end generate;
+
+    -- Mux WB response on the IO bus
+    -- The SD card response is returned while the SoC selects the SD card;
+    -- in every other case the DRAM controller response is returned
+    wb_ext_io_out <= wb_sdcard_out when wb_ext_is_sdcard = '1' else
+                     wb_dram_ctrl_out;
+
+    -- Dim the RGB LED: a free-running 9-bit counter lets the requested LED
+    -- state through only while its top five bits are zero (16 of every 512
+    -- clocks); the LED outputs are forced to '0' for the rest of the period.
+    leds_pwm : process(system_clk)
+    begin
+        if rising_edge(system_clk) then
+            if pwm_counter(8 downto 4) /= "00000" then
+                -- Outside the active window
+                led0_r <= '0';
+                led0_g <= '0';
+                led0_b <= '0';
+            else
+                -- Inside the active window: pass the requested state through
+                led0_r <= led0_r_pwm;
+                led0_g <= led0_g_pwm;
+                led0_b <= led0_b_pwm;
+            end if;
+            -- Wrapping increment; the bit pattern is the same for a signed
+            -- or unsigned interpretation of the counter
+            pwm_counter <= std_ulogic_vector(unsigned(pwm_counter) + 1);
+        end if;
+    end process;
+
+end architecture behaviour;
--- a/fpga/top-wukong-v2.vhdl
+++ b/fpga/top-wukong-v2.vhdl
@ -0,0 +1,587 @@
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+library unisim;
+use unisim.vcomponents.all;
+
+library work;
+use work.wishbone_types.all;
+
+-- Top level for the Wukong v2 board.  Instantiates work.soc and, depending
+-- on the generics, the LiteDRAM wrapper and the LiteEth core.
+entity toplevel is
+    generic (
+        -- MEMORY_SIZE is used as the block RAM size, or as the DRAM init
+        -- payload size when both USE_LITEDRAM and NO_BRAM are true
+        MEMORY_SIZE        : integer  := 16384;
+        RAM_INIT_FILE      : string   := "firmware.hex";
+        RESET_LOW          : boolean  := true;
+        -- With USE_LITEDRAM = true the architecture asserts that
+        -- CLK_FREQUENCY is 100000000
+        CLK_FREQUENCY      : positive := 100000000;
+        HAS_FPU            : boolean  := true;
+        HAS_BTC            : boolean  := true;
+        HAS_SHORT_MULT     : boolean  := false;
+        USE_LITEDRAM       : boolean  := false;
+        NO_BRAM            : boolean  := false;
+        DISABLE_FLATTEN_CORE : boolean := false;
+        SPI_FLASH_OFFSET   : integer := 4194304;
+        SPI_FLASH_DEF_CKDV : natural := 1;
+        SPI_FLASH_DEF_QUAD : boolean := true;
+        LOG_LENGTH         : natural := 512;
+        USE_LITEETH        : boolean  := false;
+        UART_IS_16550      : boolean  := true;
+        HAS_UART1          : boolean  := false;
+        USE_LITESDCARD     : boolean := false;
+        HAS_GPIO           : boolean := false;
+        NGPIO              : natural := 32
+        );
+    port(
+        -- ext_clk feeds the clock generator (configured for a 50 MHz input
+        -- in the non-DRAM build) or the LiteDRAM wrapper
+        ext_clk   : in  std_ulogic;
+        ext_rst_n : in  std_ulogic;
+
+        -- UART0 signals:
+        uart_main_tx : out std_ulogic;
+        uart_main_rx : in  std_ulogic;
+
+        -- LEDs
+        led0_n  : out std_ulogic;
+        led1_n  : out std_ulogic;
+
+        -- SPI
+        -- The four data lines are bidirectional; there is no clock port
+        -- here because the SPI clock leaves through the STARTUPE2 instance
+        spi_flash_cs_n   : out std_ulogic;
+        spi_flash_mosi   : inout std_ulogic;
+        spi_flash_miso   : inout std_ulogic;
+        spi_flash_wp_n   : inout std_ulogic;
+        spi_flash_hold_n : inout std_ulogic;
+
+        -- Ethernet
+        eth_clocks_tx    : in std_ulogic;
+        eth_clocks_gtx   : out std_ulogic;
+        eth_clocks_rx    : in std_ulogic;
+        eth_rst_n        : out std_ulogic;
+        eth_mdio         : inout std_ulogic;
+        eth_mdc          : out std_ulogic;
+        eth_rx_dv        : in std_ulogic;
+        eth_rx_er        : in std_ulogic;
+        eth_rx_data      : in std_ulogic_vector(7 downto 0);
+        eth_tx_en        : out std_ulogic;
+        eth_tx_er        : out std_ulogic;
+        eth_tx_data      : out std_ulogic_vector(7 downto 0);
+        eth_col          : in std_ulogic;
+        eth_crs          : in std_ulogic;
+
+        -- SD card
+        sdcard_data   : inout std_ulogic_vector(3 downto 0);
+        sdcard_cmd    : inout std_ulogic;
+        sdcard_clk    : out   std_ulogic;
+        sdcard_cd     : in    std_ulogic;
+
+        -- DRAM wires
+        -- (the clock pair is scalar here; the architecture adapts it to the
+        -- one-element vectors used by the LiteDRAM wrapper)
+        ddram_a       : out std_ulogic_vector(13 downto 0);
+        ddram_ba      : out std_ulogic_vector(2 downto 0);
+        ddram_ras_n   : out std_ulogic;
+        ddram_cas_n   : out std_ulogic;
+        ddram_we_n    : out std_ulogic;
+        ddram_dm      : out std_ulogic_vector(1 downto 0);
+        ddram_dq      : inout std_ulogic_vector(15 downto 0);
+        ddram_dqs_p   : inout std_ulogic_vector(1 downto 0);
+        ddram_dqs_n   : inout std_ulogic_vector(1 downto 0);
+        ddram_clk_p   : out std_ulogic;
+        ddram_clk_n   : out std_ulogic;
+        ddram_cke     : out std_ulogic;
+        ddram_odt     : out std_ulogic;
+        ddram_reset_n : out std_ulogic
+        );
+end entity toplevel;
+
+architecture behaviour of toplevel is
+
+    -- Reset signals:
+    signal soc_rst : std_ulogic;
+    signal pll_rst : std_ulogic;
+
+    -- Internal clock signals:
+    signal system_clk        : std_ulogic;
+    signal system_clk_locked : std_ulogic;
+
+    -- External IOs from the SoC
+    signal wb_ext_io_in        : wb_io_master_out;
+    signal wb_ext_io_out       : wb_io_slave_out;
+    signal wb_ext_is_dram_csr  : std_ulogic;
+    signal wb_ext_is_dram_init : std_ulogic;
+    signal wb_ext_is_eth       : std_ulogic;
+    signal wb_ext_is_sdcard    : std_ulogic;
+
+    -- DRAM main data wishbone connection
+    signal wb_dram_in          : wishbone_master_out;
+    signal wb_dram_out         : wishbone_slave_out;
+
+    -- DRAM control wishbone connection
+    signal wb_dram_ctrl_out    : wb_io_slave_out := wb_io_slave_out_init;
+
+    -- LiteEth connection
+    signal ext_irq_eth         : std_ulogic;
+    signal wb_eth_out          : wb_io_slave_out := wb_io_slave_out_init;
+
+    -- LiteSDCard connection
+    signal ext_irq_sdcard      : std_ulogic := '0';
+    signal wb_sdcard_out       : wb_io_slave_out := wb_io_slave_out_init;
+    signal wb_sddma_out        : wb_io_master_out := wb_io_master_out_init;
+    signal wb_sddma_in         : wb_io_slave_out;
+    signal wb_sddma_nr         : wb_io_master_out;
+    signal wb_sddma_ir         : wb_io_slave_out;
+    -- for conversion from non-pipelined wishbone to pipelined
+    signal wb_sddma_stb_sent   : std_ulogic;
+
+    -- Control/status
+    signal core_alt_reset : std_ulogic;
+
+    -- SPI flash
+    signal spi_sck     : std_ulogic;
+    signal spi_cs_n    : std_ulogic;
+    signal spi_sdat_o  : std_ulogic_vector(3 downto 0);
+    signal spi_sdat_oe : std_ulogic_vector(3 downto 0);
+    signal spi_sdat_i  : std_ulogic_vector(3 downto 0);
+
+    -- ddram clock signals as vectors
+    signal ddram_clk_p_vec : std_ulogic_vector(0 downto 0);
+    signal ddram_clk_n_vec : std_ulogic_vector(0 downto 0);
+
+    -- Fixup various memory sizes based on generics
+    -- Block RAM size passed to the SoC: zero when the design runs purely
+    -- from LiteDRAM (USE_LITEDRAM and NO_BRAM), MEMORY_SIZE otherwise.
+    function get_bram_size return natural is
+        variable size : integer := 0;
+    begin
+        if not (USE_LITEDRAM and NO_BRAM) then
+            size := MEMORY_SIZE;
+        end if;
+        return size;
+    end function;
+
+    -- Size of the payload preloaded through the DRAM init path: MEMORY_SIZE
+    -- when the design runs purely from LiteDRAM (USE_LITEDRAM and NO_BRAM),
+    -- zero otherwise.
+    function get_payload_size return natural is
+        variable size : integer := 0;
+    begin
+        if USE_LITEDRAM and NO_BRAM then
+            size := MEMORY_SIZE;
+        end if;
+        return size;
+    end function;
+    
+    constant BRAM_SIZE    : natural := get_bram_size;
+    constant PAYLOAD_SIZE : natural := get_payload_size;
+begin
+
+    -- Main SoC
+    -- Memory sizing comes from the derived BRAM_SIZE / PAYLOAD_SIZE constants
+    -- rather than directly from MEMORY_SIZE; the SPI flash is always enabled
+    -- with four data lines.
+    soc0: entity work.soc
+        generic map(
+            MEMORY_SIZE        => BRAM_SIZE,
+            RAM_INIT_FILE      => RAM_INIT_FILE,
+            SIM                => false,
+            CLK_FREQ           => CLK_FREQUENCY,
+            HAS_FPU            => HAS_FPU,
+            HAS_BTC            => HAS_BTC,
+            HAS_SHORT_MULT     => HAS_SHORT_MULT,
+            HAS_DRAM           => USE_LITEDRAM,
+            DRAM_SIZE          => 256 * 1024 * 1024,
+            DRAM_INIT_SIZE     => PAYLOAD_SIZE,
+            DISABLE_FLATTEN_CORE => DISABLE_FLATTEN_CORE,
+            HAS_SPI_FLASH      => true,
+            SPI_FLASH_DLINES   => 4,
+            SPI_FLASH_OFFSET   => SPI_FLASH_OFFSET,
+            SPI_FLASH_DEF_CKDV => SPI_FLASH_DEF_CKDV,
+            SPI_FLASH_DEF_QUAD => SPI_FLASH_DEF_QUAD,
+            LOG_LENGTH         => LOG_LENGTH,
+            HAS_LITEETH        => USE_LITEETH,
+            UART0_IS_16550     => UART_IS_16550,
+            HAS_UART1          => HAS_UART1,
+            HAS_SD_CARD        => USE_LITESDCARD,
+            HAS_GPIO           => HAS_GPIO,
+            NGPIO              => NGPIO
+            )
+        port map (
+            -- System signals
+            system_clk        => system_clk,
+            rst               => soc_rst,
+
+            -- UART signals
+            uart0_txd         => uart_main_tx,
+            uart0_rxd         => uart_main_rx,
+
+            -- SPI signals
+            spi_flash_sck     => spi_sck,
+            spi_flash_cs_n    => spi_cs_n,
+            spi_flash_sdat_o  => spi_sdat_o,
+            spi_flash_sdat_oe => spi_sdat_oe,
+            spi_flash_sdat_i  => spi_sdat_i,
+
+            -- External interrupts
+            ext_irq_eth       => ext_irq_eth,
+            ext_irq_sdcard    => ext_irq_sdcard,
+
+            -- DRAM wishbone
+            wb_dram_in           => wb_dram_in,
+            wb_dram_out          => wb_dram_out,
+
+            -- IO wishbone
+            -- (the wb_ext_is_* outputs tell the top level which external
+            -- device the current IO access is addressed to)
+            wb_ext_io_in         => wb_ext_io_in,
+            wb_ext_io_out        => wb_ext_io_out,
+            wb_ext_is_dram_csr   => wb_ext_is_dram_csr,
+            wb_ext_is_dram_init  => wb_ext_is_dram_init,
+            wb_ext_is_eth        => wb_ext_is_eth,
+            wb_ext_is_sdcard     => wb_ext_is_sdcard,
+
+            -- DMA wishbone
+            wishbone_dma_in      => wb_sddma_in,
+            wishbone_dma_out     => wb_sddma_out,
+
+            alt_reset            => core_alt_reset
+            );
+
+    -- SPI Flash
+    -- Each of the four data pins is a tristate: it drives the SoC's output
+    -- bit while the matching output-enable bit is '1' and is released ('Z')
+    -- otherwise.  The pin level is always fed back as the input bit.
+    -- Bit mapping: 0 = mosi, 1 = miso, 2 = wp_n, 3 = hold_n.
+    spi_flash_cs_n   <= spi_cs_n;
+    spi_flash_mosi   <= spi_sdat_o(0) when spi_sdat_oe(0) = '1' else 'Z';
+    spi_flash_miso   <= spi_sdat_o(1) when spi_sdat_oe(1) = '1' else 'Z';
+    spi_flash_wp_n   <= spi_sdat_o(2) when spi_sdat_oe(2) = '1' else 'Z';
+    spi_flash_hold_n <= spi_sdat_o(3) when spi_sdat_oe(3) = '1' else 'Z';
+    spi_sdat_i(0)    <= spi_flash_mosi;
+    spi_sdat_i(1)    <= spi_flash_miso;
+    spi_sdat_i(2)    <= spi_flash_wp_n;
+    spi_sdat_i(3)    <= spi_flash_hold_n;
+
+    -- The SPI clock has no top-level port; it is handed to the STARTUPE2
+    -- primitive through USRCCLKO, with the tristate control USRCCLKTS tied
+    -- to '0'.  All other inputs are tied to constants.
+    STARTUPE2_INST: STARTUPE2
+        port map (
+            CLK => '0',
+            GSR => '0',
+            GTS => '0',
+            KEYCLEARB => '0',
+            PACK => '0',
+            USRCCLKO => spi_sck,
+            USRCCLKTS => '0',
+            USRDONEO => '1',
+            USRDONETS => '0'
+            );
+
+    -- Build without LiteDRAM: the system clock comes from the local clock
+    -- generator and the SoC reset directly from the reset controller.
+    nodram: if not USE_LITEDRAM generate
+        -- Constant input for the dummy differential DRAM clock buffer below
+        signal ddram_clk_dummy : std_ulogic;
+    begin
+        reset_controller: entity work.soc_reset
+            generic map(
+                RESET_LOW => RESET_LOW
+                )
+            port map(
+                ext_clk => ext_clk,
+                pll_clk => system_clk,
+                pll_locked_in => system_clk_locked,
+                ext_rst_in => ext_rst_n,
+                pll_rst_out => pll_rst,
+                rst_out => soc_rst
+                );
+
+        -- The input clock frequency is fixed at 50 MHz for this board
+        clkgen: entity work.clock_generator
+            generic map(
+                CLK_INPUT_HZ => 50000000,
+                CLK_OUTPUT_HZ => CLK_FREQUENCY
+                )
+            port map(
+                ext_clk => ext_clk,
+                pll_rst_in => pll_rst,
+                pll_clk_out => system_clk,
+                pll_locked_out => system_clk_locked
+                );
+
+        core_alt_reset <= '0';
+
+        -- Vivado barfs on those differential signals if left
+        -- unconnected. So instantiate a differential buffer and
+        -- feed it a constant '0'.
+        dummy_dram_clk: OBUFDS
+            port map (
+                O => ddram_clk_p,
+                OB => ddram_clk_n,
+                I => ddram_clk_dummy
+                );
+        ddram_clk_dummy <= '0';
+
+    end generate;
+
+    -- Build with LiteDRAM: the LiteDRAM wrapper produces the system clock,
+    -- the PLL lock flag and a system reset; the SoC reset is derived from
+    -- those in soc_rst_gen.
+    has_dram: if USE_LITEDRAM generate
+        signal dram_init_done  : std_ulogic;
+        signal dram_init_error : std_ulogic;
+        signal dram_sys_rst    : std_ulogic;
+        signal rst_gen_rst     : std_ulogic;
+    begin
+
+        -- Eventually dig out the frequency from the generator
+        -- but for now, assert it's 100Mhz
+        assert CLK_FREQUENCY = 100000000
+            report "CLK_FREQUENCY must be 100000000 when USE_LITEDRAM is true";
+
+        -- Only the PLL reset output is used here; rst_gen_rst is left
+        -- unconnected to anything else and soc_rst is generated below
+        reset_controller: entity work.soc_reset
+            generic map(
+                RESET_LOW => RESET_LOW,
+                PLL_RESET_BITS => 18,
+                SOC_RESET_BITS => 1
+                )
+            port map(
+                ext_clk => ext_clk,
+                pll_clk => system_clk,
+                pll_locked_in => system_clk_locked,
+                ext_rst_in => ext_rst_n,
+                pll_rst_out => pll_rst,
+                rst_out => rst_gen_rst
+                );
+
+        -- Generate SoC reset
+        -- ext_rst_n acts as an asynchronous reset, so it must be in the
+        -- sensitivity list for simulation to match the synthesised flop.
+        -- Otherwise the SoC stays in reset while the DRAM wrapper asserts
+        -- its system reset or the PLL has not locked.
+        soc_rst_gen: process(system_clk, ext_rst_n)
+        begin
+            if ext_rst_n = '0' then
+                soc_rst <= '1';
+            elsif rising_edge(system_clk) then
+                soc_rst <= dram_sys_rst or not system_clk_locked;
+            end if;
+        end process;
+
+        -- The wrapper drives the DRAM clock as one-element vectors while the
+        -- top-level pins are scalars: forward element 0 of each vector to the
+        -- corresponding output pin.  (The vectors must not be assigned here;
+        -- they are outputs of the wrapper and are unresolved.)
+        ddram_clk_p <= ddram_clk_p_vec(0);
+        ddram_clk_n <= ddram_clk_n_vec(0);
+
+        dram: entity work.litedram_wrapper
+            generic map(
+                DRAM_ABITS => 24,
+                DRAM_ALINES => 14,
+                DRAM_DLINES => 16,
+                DRAM_CKLINES => 1,
+                DRAM_PORT_WIDTH => 128,
+                PAYLOAD_FILE => RAM_INIT_FILE,
+                PAYLOAD_SIZE => PAYLOAD_SIZE
+                )
+            port map(
+                clk_in          => ext_clk,
+                rst             => pll_rst,
+                system_clk      => system_clk,
+                system_reset    => dram_sys_rst,
+                core_alt_reset  => core_alt_reset,
+                pll_locked      => system_clk_locked,
+
+                wb_in           => wb_dram_in,
+                wb_out          => wb_dram_out,
+                wb_ctrl_in      => wb_ext_io_in,
+                wb_ctrl_out     => wb_dram_ctrl_out,
+                wb_ctrl_is_csr  => wb_ext_is_dram_csr,
+                wb_ctrl_is_init => wb_ext_is_dram_init,
+
+                init_done       => dram_init_done,
+                init_error      => dram_init_error,
+
+                ddram_a         => ddram_a,
+                ddram_ba        => ddram_ba,
+                ddram_ras_n     => ddram_ras_n,
+                ddram_cas_n     => ddram_cas_n,
+                ddram_we_n      => ddram_we_n,
+                -- this top level has no chip-select pin
+                ddram_cs_n      => open,
+                ddram_dm        => ddram_dm,
+                ddram_dq        => ddram_dq,
+                ddram_dqs_p     => ddram_dqs_p,
+                ddram_dqs_n     => ddram_dqs_n,
+                ddram_clk_p     => ddram_clk_p_vec,
+                ddram_clk_n     => ddram_clk_n_vec,
+                ddram_cke       => ddram_cke,
+                ddram_odt       => ddram_odt,
+                ddram_reset_n   => ddram_reset_n
+                );
+
+    end generate;
+
+    -- Ethernet MAC: instantiated only when USE_LITEETH is true.
+    -- The liteeth_core component is attached as a slave on the external IO
+    -- wishbone bus (wb_ext_io_in / wb_eth_out), selected by wb_ext_is_eth,
+    -- and raises ext_irq_eth.  Its GMII pins go straight to the top-level
+    -- eth_* ports.
+    has_liteeth : if USE_LITEETH generate
+
+        -- Component declaration for the liteeth_core
+        component liteeth_core port (
+            sys_clock           : in std_ulogic;
+            sys_reset           : in std_ulogic;
+            gmii_eth_clocks_tx  : in std_ulogic;
+            gmii_eth_clocks_gtx : out std_ulogic;
+            gmii_eth_clocks_rx  : in std_ulogic;
+            gmii_eth_rst_n      : out std_ulogic;
+            gmii_eth_mdio       : inout std_ulogic;
+            gmii_eth_mdc        : out std_ulogic;
+            gmii_eth_rx_dv      : in std_ulogic;
+            gmii_eth_rx_er      : in std_ulogic;
+            gmii_eth_rx_data    : in std_ulogic_vector(7 downto 0);
+            gmii_eth_tx_en      : out std_ulogic;
+            gmii_eth_tx_er      : out std_ulogic;
+            gmii_eth_tx_data    : out std_ulogic_vector(7 downto 0);
+            gmii_eth_col        : in std_ulogic;
+            gmii_eth_crs        : in std_ulogic;
+            wishbone_adr        : in std_ulogic_vector(29 downto 0);
+            wishbone_dat_w      : in std_ulogic_vector(31 downto 0);
+            wishbone_dat_r      : out std_ulogic_vector(31 downto 0);
+            wishbone_sel        : in std_ulogic_vector(3 downto 0);
+            wishbone_cyc        : in std_ulogic;
+            wishbone_stb        : in std_ulogic;
+            wishbone_ack        : out std_ulogic;
+            wishbone_we         : in std_ulogic;
+            wishbone_cti        : in std_ulogic_vector(2 downto 0);
+            wishbone_bte        : in std_ulogic_vector(1 downto 0);
+            wishbone_err        : out std_ulogic;
+            interrupt           : out std_ulogic
+            );
+        end component;
+
+        -- Locally gated cycle and truncated address presented to the core
+        signal wb_eth_cyc     : std_ulogic;
+        signal wb_eth_adr     : std_ulogic_vector(29 downto 0);
+    begin
+        liteeth :  liteeth_core
+            port map(
+                sys_clock           => system_clk,
+                sys_reset           => soc_rst,
+                gmii_eth_clocks_tx  => eth_clocks_tx,
+                gmii_eth_clocks_gtx => eth_clocks_gtx,
+                gmii_eth_clocks_rx  => eth_clocks_rx,
+                gmii_eth_rst_n      => eth_rst_n,
+                gmii_eth_mdio       => eth_mdio,
+                gmii_eth_mdc        => eth_mdc,
+                gmii_eth_rx_dv      => eth_rx_dv,
+                gmii_eth_rx_er      => eth_rx_er,
+                gmii_eth_rx_data    => eth_rx_data,
+                gmii_eth_tx_en      => eth_tx_en,
+                gmii_eth_tx_er      => eth_tx_er,
+                gmii_eth_tx_data    => eth_tx_data,
+                gmii_eth_col        => eth_col,
+                gmii_eth_crs        => eth_crs,
+                wishbone_adr        => wb_eth_adr,
+                wishbone_dat_w      => wb_ext_io_in.dat,
+                wishbone_dat_r      => wb_eth_out.dat,
+                wishbone_sel        => wb_ext_io_in.sel,
+                wishbone_cyc        => wb_eth_cyc,
+                wishbone_stb        => wb_ext_io_in.stb,
+                wishbone_ack        => wb_eth_out.ack,
+                wishbone_we         => wb_ext_io_in.we,
+                -- burst tag inputs are tied to zero; the error output is unused
+                wishbone_cti        => "000",
+                wishbone_bte        => "00",
+                wishbone_err        => open,
+                interrupt           => ext_irq_eth
+                );
+
+        -- Gate cyc with "chip select" from soc
+        wb_eth_cyc <= wb_ext_io_in.cyc and wb_ext_is_eth;
+
+        -- Remove top address bits as liteeth decoder doesn't know about them:
+        -- only the low 15 address bits are passed, zero-extended to 30 bits
+        wb_eth_adr <= x"000" & "000" & wb_ext_io_in.adr(14 downto 0);
+
+        -- LiteETH isn't pipelined, so stall is asserted whenever ack is not
+        wb_eth_out.stall <= not wb_eth_out.ack;
+
+    end generate;
+
+    -- Without the Ethernet core nothing else drives its interrupt line,
+    -- so tie it inactive.
+    no_liteeth : if not USE_LITEETH generate
+        ext_irq_eth    <= '0';
+    end generate;
+
+    -- SD card pmod
+    -- Instantiated only when USE_LITESDCARD is true.  The litesdcard_core has
+    -- two wishbone interfaces: a control slave on the external IO bus
+    -- (selected by wb_ext_is_sdcard) and a DMA master whose non-pipelined
+    -- signals (wb_sddma_nr / wb_sddma_ir) are adapted to the pipelined
+    -- wb_sddma_out / wb_sddma_in pair by the process at the end of the block.
+    has_sdcard : if USE_LITESDCARD generate
+        component litesdcard_core port (
+            clk           : in    std_ulogic;
+            rst           : in    std_ulogic;
+            -- wishbone for accessing control registers
+            wb_ctrl_adr   : in    std_ulogic_vector(29 downto 0);
+            wb_ctrl_dat_w : in    std_ulogic_vector(31 downto 0);
+            wb_ctrl_dat_r : out   std_ulogic_vector(31 downto 0);
+            wb_ctrl_sel   : in    std_ulogic_vector(3 downto 0);
+            wb_ctrl_cyc   : in    std_ulogic;
+            wb_ctrl_stb   : in    std_ulogic;
+            wb_ctrl_ack   : out   std_ulogic;
+            wb_ctrl_we    : in    std_ulogic;
+            wb_ctrl_cti   : in    std_ulogic_vector(2 downto 0);
+            wb_ctrl_bte   : in    std_ulogic_vector(1 downto 0);
+            wb_ctrl_err   : out   std_ulogic;
+            -- wishbone for SD card core to use for DMA
+            wb_dma_adr    : out   std_ulogic_vector(29 downto 0);
+            wb_dma_dat_w  : out   std_ulogic_vector(31 downto 0);
+            wb_dma_dat_r  : in    std_ulogic_vector(31 downto 0);
+            wb_dma_sel    : out   std_ulogic_vector(3 downto 0);
+            wb_dma_cyc    : out   std_ulogic;
+            wb_dma_stb    : out   std_ulogic;
+            wb_dma_ack    : in    std_ulogic;
+            wb_dma_we     : out   std_ulogic;
+            wb_dma_cti    : out   std_ulogic_vector(2 downto 0);
+            wb_dma_bte    : out   std_ulogic_vector(1 downto 0);
+            wb_dma_err    : in    std_ulogic;
+            -- connections to SD card
+            sdcard_data   : inout std_ulogic_vector(3 downto 0);
+            sdcard_cmd    : inout std_ulogic;
+            sdcard_clk    : out   std_ulogic;
+            sdcard_cd     : in    std_ulogic;
+            irq           : out   std_ulogic
+            );
+        end component;
+
+        -- Locally gated cycle and truncated address for the control port
+        signal wb_sdcard_cyc : std_ulogic;
+        signal wb_sdcard_adr : std_ulogic_vector(29 downto 0);
+
+    begin
+        litesdcard : litesdcard_core
+            port map (
+                clk           => system_clk,
+                rst           => soc_rst,
+                wb_ctrl_adr   => wb_sdcard_adr,
+                wb_ctrl_dat_w => wb_ext_io_in.dat,
+                wb_ctrl_dat_r => wb_sdcard_out.dat,
+                wb_ctrl_sel   => wb_ext_io_in.sel,
+                wb_ctrl_cyc   => wb_sdcard_cyc,
+                wb_ctrl_stb   => wb_ext_io_in.stb,
+                wb_ctrl_ack   => wb_sdcard_out.ack,
+                wb_ctrl_we    => wb_ext_io_in.we,
+                -- control-port burst tags tied to zero, error output unused
+                wb_ctrl_cti   => "000",
+                wb_ctrl_bte   => "00",
+                wb_ctrl_err   => open,
+                -- DMA master: outputs go to wb_sddma_nr, responses come from
+                -- the registered copy wb_sddma_ir
+                wb_dma_adr    => wb_sddma_nr.adr,
+                wb_dma_dat_w  => wb_sddma_nr.dat,
+                wb_dma_dat_r  => wb_sddma_ir.dat,
+                wb_dma_sel    => wb_sddma_nr.sel,
+                wb_dma_cyc    => wb_sddma_nr.cyc,
+                wb_dma_stb    => wb_sddma_nr.stb,
+                wb_dma_ack    => wb_sddma_ir.ack,
+                wb_dma_we     => wb_sddma_nr.we,
+                wb_dma_cti    => open,
+                wb_dma_bte    => open,
+                -- no error signalling towards the DMA master
+                wb_dma_err    => '0',
+                sdcard_data   => sdcard_data,
+                sdcard_cmd    => sdcard_cmd,
+                sdcard_clk    => sdcard_clk,
+                sdcard_cd     => sdcard_cd,
+                irq           => ext_irq_sdcard
+                );
+
+        -- Gate cyc with chip select from SoC
+        wb_sdcard_cyc <= wb_ext_io_in.cyc and wb_ext_is_sdcard;
+
+        -- Pass only the low 14 address bits, zero-extended to 30 bits
+        wb_sdcard_adr <= x"0000" & wb_ext_io_in.adr(13 downto 0);
+
+        -- Control port is treated as non-pipelined: stall whenever ack is not
+        -- asserted
+        wb_sdcard_out.stall <= not wb_sdcard_out.ack;
+
+        -- Convert non-pipelined DMA wishbone to pipelined by suppressing
+        -- non-acknowledged strobes
+        process(system_clk)
+        begin
+            if rising_edge(system_clk) then
+                -- Register the core's master outputs; stb may be overridden
+                -- by the later assignment below
+                wb_sddma_out <= wb_sddma_nr;
+                -- Drop stb once the request has been recorded as sent, or in
+                -- the cycle after the currently presented strobe is accepted
+                -- (stb high and not stalled)
+                if wb_sddma_stb_sent = '1' or
+                    (wb_sddma_out.stb = '1' and wb_sddma_in.stall = '0') then
+                    wb_sddma_out.stb <= '0';
+                end if;
+                -- stb_sent is cleared when the core drops cyc or sees the
+                -- (registered) ack; otherwise, while not stalled, it follows
+                -- the core's strobe
+                if wb_sddma_nr.cyc = '0' or wb_sddma_ir.ack = '1' then
+                    wb_sddma_stb_sent <= '0';
+                elsif wb_sddma_in.stall = '0' then
+                    wb_sddma_stb_sent <= wb_sddma_nr.stb;
+                end if;
+                -- Register the response returned to the core
+                wb_sddma_ir <= wb_sddma_in;
+            end if;
+        end process;
+
+    end generate;
+
+    -- Mux WB response on the IO bus: Ethernet has priority, then the SD card
+    -- controller; anything else gets the DRAM control response.
+    wb_ext_io_out <= wb_eth_out when wb_ext_is_eth = '1' else
+                     wb_sdcard_out when wb_ext_is_sdcard = '1' else
+                     wb_dram_ctrl_out;
+
+    -- Status LEDs.
+    -- NOTE(review): the _n suffix suggests active-low pins; if so, led0 is
+    -- lit while the PLL is NOT locked and led1 is lit while soc_rst is
+    -- asserted -- confirm this is the intended polarity.
+    led0_n <= system_clk_locked;
+    led1_n <= not soc_rst;
+
+end architecture behaviour;
--- a/fpga/wukong-v2.xdc
+++ b/fpga/wukong-v2.xdc
@ -0,0 +1,487 @@
+################################################################################
+# clkin, reset, uart pins...
+################################################################################
+
+set_property -dict { PACKAGE_PIN M21  IOSTANDARD LVCMOS33 } [get_ports { ext_clk }];
+
+set_property -dict { PACKAGE_PIN H7   IOSTANDARD LVCMOS33 } [get_ports { ext_rst_n }];
+
+set_property -dict { PACKAGE_PIN E3   IOSTANDARD LVCMOS33 } [get_ports { uart_main_tx }];
+set_property -dict { PACKAGE_PIN F3   IOSTANDARD LVCMOS33 } [get_ports { uart_main_rx }];
+
+################################################################################
+# LEDs
+################################################################################
+
+set_property -dict { PACKAGE_PIN V16  IOSTANDARD LVCMOS33 } [get_ports { led0_n }];
+set_property -dict { PACKAGE_PIN V17  IOSTANDARD LVCMOS33 } [get_ports { led1_n }];
+
+################################################################################
+# SPI Flash
+################################################################################
+
+set_property -dict { PACKAGE_PIN P18  IOSTANDARD LVCMOS33 } [get_ports { spi_flash_cs_n }];
+set_property -dict { PACKAGE_PIN R14  IOSTANDARD LVCMOS33 } [get_ports { spi_flash_mosi }];
+set_property -dict { PACKAGE_PIN R15  IOSTANDARD LVCMOS33 } [get_ports { spi_flash_miso }];
+set_property -dict { PACKAGE_PIN P14  IOSTANDARD LVCMOS33 } [get_ports { spi_flash_wp_n }];
+set_property -dict { PACKAGE_PIN N14  IOSTANDARD LVCMOS33 } [get_ports { spi_flash_hold_n }];
+
+################################################################################
+# Micro SD
+################################################################################
+
+set_property -dict { PACKAGE_PIN M5   IOSTANDARD LVCMOS33 SLEW FAST } [get_ports { sdcard_data[0] }];
+set_property -dict { PACKAGE_PIN M7   IOSTANDARD LVCMOS33 SLEW FAST } [get_ports { sdcard_data[1] }];
+set_property -dict { PACKAGE_PIN H6   IOSTANDARD LVCMOS33 SLEW FAST } [get_ports { sdcard_data[2] }];
+set_property -dict { PACKAGE_PIN J6   IOSTANDARD LVCMOS33 SLEW FAST } [get_ports { sdcard_data[3] }];
+set_property -dict { PACKAGE_PIN J8   IOSTANDARD LVCMOS33 SLEW FAST } [get_ports { sdcard_cmd }];
+set_property -dict { PACKAGE_PIN L4   IOSTANDARD LVCMOS33 SLEW FAST } [get_ports { sdcard_clk }];
+set_property -dict { PACKAGE_PIN N6   IOSTANDARD LVCMOS33 } [get_ports { sdcard_cd }];
+
+# Put registers into IOBs to improve timing
+set_property IOB true [get_cells -hierarchical -filter {NAME =~*.litesdcard/sdcard_*}]
+
+################################################################################
+# PMOD header J10 (high-speed, no protection resistors)
+################################################################################
+
+#set_property -dict { PACKAGE_PIN D5   IOSTANDARD LVCMOS33 } [get_ports { pmod_j10_1 }];
+#set_property -dict { PACKAGE_PIN G5   IOSTANDARD LVCMOS33 } [get_ports { pmod_j10_2 }];
+#set_property -dict { PACKAGE_PIN G7   IOSTANDARD LVCMOS33 } [get_ports { pmod_j10_3 }];
+#set_property -dict { PACKAGE_PIN G8   IOSTANDARD LVCMOS33 } [get_ports { pmod_j10_4 }];
+#set_property -dict { PACKAGE_PIN E5   IOSTANDARD LVCMOS33 } [get_ports { pmod_j10_7 }];
+#set_property -dict { PACKAGE_PIN E6   IOSTANDARD LVCMOS33 } [get_ports { pmod_j10_8 }];
+#set_property -dict { PACKAGE_PIN D6   IOSTANDARD LVCMOS33 } [get_ports { pmod_j10_9 }];
+#set_property -dict { PACKAGE_PIN G6   IOSTANDARD LVCMOS33 } [get_ports { pmod_j10_10 }];
+
+################################################################################
+# PMOD header J11 (high-speed, no protection resistors)
+################################################################################
+
+#set_property -dict { PACKAGE_PIN H4   IOSTANDARD LVCMOS33 } [get_ports { pmod_j11_1 }];
+#set_property -dict { PACKAGE_PIN F4   IOSTANDARD LVCMOS33 } [get_ports { pmod_j11_2 }];
+#set_property -dict { PACKAGE_PIN A4   IOSTANDARD LVCMOS33 } [get_ports { pmod_j11_3 }];
+#set_property -dict { PACKAGE_PIN A5   IOSTANDARD LVCMOS33 } [get_ports { pmod_j11_4 }];
+#set_property -dict { PACKAGE_PIN J4   IOSTANDARD LVCMOS33 } [get_ports { pmod_j11_7 }];
+#set_property -dict { PACKAGE_PIN G4   IOSTANDARD LVCMOS33 } [get_ports { pmod_j11_8 }];
+#set_property -dict { PACKAGE_PIN B4   IOSTANDARD LVCMOS33 } [get_ports { pmod_j11_9 }];
+#set_property -dict { PACKAGE_PIN B5   IOSTANDARD LVCMOS33 } [get_ports { pmod_j11_10 }];
+
+################################################################################
+# HDR 20X2 connector
+################################################################################
+
+## TODO
+
+################################################################################
+# Ethernet (generated by LiteX)
+################################################################################
+
+# eth_clocks:0.tx
+set_property LOC M2 [get_ports {eth_clocks_tx}]
+set_property IOSTANDARD LVCMOS33 [get_ports {eth_clocks_tx}]
+
+# eth_clocks:0.gtx
+set_property LOC U1 [get_ports {eth_clocks_gtx}]
+set_property IOSTANDARD LVCMOS33 [get_ports {eth_clocks_gtx}]
+
+# eth_clocks:0.rx
+set_property LOC P4 [get_ports {eth_clocks_rx}]
+set_property IOSTANDARD LVCMOS33 [get_ports {eth_clocks_rx}]
+
+# eth:0.rst_n
+set_property LOC R1 [get_ports {eth_rst_n}]
+set_property IOSTANDARD LVCMOS33 [get_ports {eth_rst_n}]
+
+# eth:0.mdio
+set_property LOC H1 [get_ports {eth_mdio}]
+set_property IOSTANDARD LVCMOS33 [get_ports {eth_mdio}]
+
+# eth:0.mdc
+set_property LOC H2 [get_ports {eth_mdc}]
+set_property IOSTANDARD LVCMOS33 [get_ports {eth_mdc}]
+
+# eth:0.rx_dv
+set_property LOC L3 [get_ports {eth_rx_dv}]
+set_property IOSTANDARD LVCMOS33 [get_ports {eth_rx_dv}]
+
+# eth:0.rx_er
+set_property LOC U5 [get_ports {eth_rx_er}]
+set_property IOSTANDARD LVCMOS33 [get_ports {eth_rx_er}]
+
+# eth:0.rx_data
+set_property LOC M4 [get_ports {eth_rx_data[0]}]
+set_property IOSTANDARD LVCMOS33 [get_ports {eth_rx_data[0]}]
+
+# eth:0.rx_data
+set_property LOC N3 [get_ports {eth_rx_data[1]}]
+set_property IOSTANDARD LVCMOS33 [get_ports {eth_rx_data[1]}]
+
+# eth:0.rx_data
+set_property LOC N4 [get_ports {eth_rx_data[2]}]
+set_property IOSTANDARD LVCMOS33 [get_ports {eth_rx_data[2]}]
+
+# eth:0.rx_data
+set_property LOC P3 [get_ports {eth_rx_data[3]}]
+set_property IOSTANDARD LVCMOS33 [get_ports {eth_rx_data[3]}]
+
+# eth:0.rx_data
+set_property LOC R3 [get_ports {eth_rx_data[4]}]
+set_property IOSTANDARD LVCMOS33 [get_ports {eth_rx_data[4]}]
+
+# eth:0.rx_data
+set_property LOC T3 [get_ports {eth_rx_data[5]}]
+set_property IOSTANDARD LVCMOS33 [get_ports {eth_rx_data[5]}]
+
+# eth:0.rx_data
+set_property LOC T4 [get_ports {eth_rx_data[6]}]
+set_property IOSTANDARD LVCMOS33 [get_ports {eth_rx_data[6]}]
+
+# eth:0.rx_data
+set_property LOC T5 [get_ports {eth_rx_data[7]}]
+set_property IOSTANDARD LVCMOS33 [get_ports {eth_rx_data[7]}]
+
+# eth:0.tx_en
+set_property LOC T2 [get_ports {eth_tx_en}]
+set_property IOSTANDARD LVCMOS33 [get_ports {eth_tx_en}]
+
+# eth:0.tx_er
+set_property LOC J1 [get_ports {eth_tx_er}]
+set_property IOSTANDARD LVCMOS33 [get_ports {eth_tx_er}]
+
+# eth:0.tx_data
+set_property LOC R2 [get_ports {eth_tx_data[0]}]
+set_property IOSTANDARD LVCMOS33 [get_ports {eth_tx_data[0]}]
+
+# eth:0.tx_data
+set_property LOC P1 [get_ports {eth_tx_data[1]}]
+set_property IOSTANDARD LVCMOS33 [get_ports {eth_tx_data[1]}]
+
+# eth:0.tx_data
+set_property LOC N2 [get_ports {eth_tx_data[2]}]
+set_property IOSTANDARD LVCMOS33 [get_ports {eth_tx_data[2]}]
+
+# eth:0.tx_data
+set_property LOC N1 [get_ports {eth_tx_data[3]}]
+set_property IOSTANDARD LVCMOS33 [get_ports {eth_tx_data[3]}]
+
+# eth:0.tx_data
+set_property LOC M1 [get_ports {eth_tx_data[4]}]
+set_property IOSTANDARD LVCMOS33 [get_ports {eth_tx_data[4]}]
+
+# eth:0.tx_data
+set_property LOC L2 [get_ports {eth_tx_data[5]}]
+set_property IOSTANDARD LVCMOS33 [get_ports {eth_tx_data[5]}]
+
+# eth:0.tx_data
+set_property LOC K2 [get_ports {eth_tx_data[6]}]
+set_property IOSTANDARD LVCMOS33 [get_ports {eth_tx_data[6]}]
+
+# eth:0.tx_data
+set_property LOC K1 [get_ports {eth_tx_data[7]}]
+set_property IOSTANDARD LVCMOS33 [get_ports {eth_tx_data[7]}]
+
+# eth:0.col
+set_property LOC U4 [get_ports {eth_col}]
+set_property IOSTANDARD LVCMOS33 [get_ports {eth_col}]
+
+# eth:0.crs
+set_property LOC U2 [get_ports {eth_crs}]
+set_property IOSTANDARD LVCMOS33 [get_ports {eth_crs}]
+
+################################################################################
+# DRAM (generated by LiteX)
+################################################################################
+
+# ddram:0.a
+set_property LOC E17 [get_ports {ddram_a[0]}]
+set_property SLEW FAST [get_ports {ddram_a[0]}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_a[0]}]
+
+# ddram:0.a
+set_property LOC G17 [get_ports {ddram_a[1]}]
+set_property SLEW FAST [get_ports {ddram_a[1]}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_a[1]}]
+
+# ddram:0.a
+set_property LOC F17 [get_ports {ddram_a[2]}]
+set_property SLEW FAST [get_ports {ddram_a[2]}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_a[2]}]
+
+# ddram:0.a
+set_property LOC C17 [get_ports {ddram_a[3]}]
+set_property SLEW FAST [get_ports {ddram_a[3]}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_a[3]}]
+
+# ddram:0.a
+set_property LOC G16 [get_ports {ddram_a[4]}]
+set_property SLEW FAST [get_ports {ddram_a[4]}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_a[4]}]
+
+# ddram:0.a
+set_property LOC D16 [get_ports {ddram_a[5]}]
+set_property SLEW FAST [get_ports {ddram_a[5]}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_a[5]}]
+
+# ddram:0.a
+set_property LOC H16 [get_ports {ddram_a[6]}]
+set_property SLEW FAST [get_ports {ddram_a[6]}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_a[6]}]
+
+# ddram:0.a
+set_property LOC E16 [get_ports {ddram_a[7]}]
+set_property SLEW FAST [get_ports {ddram_a[7]}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_a[7]}]
+
+# ddram:0.a
+set_property LOC H14 [get_ports {ddram_a[8]}]
+set_property SLEW FAST [get_ports {ddram_a[8]}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_a[8]}]
+
+# ddram:0.a
+set_property LOC F15 [get_ports {ddram_a[9]}]
+set_property SLEW FAST [get_ports {ddram_a[9]}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_a[9]}]
+
+# ddram:0.a
+set_property LOC F20 [get_ports {ddram_a[10]}]
+set_property SLEW FAST [get_ports {ddram_a[10]}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_a[10]}]
+
+# ddram:0.a
+set_property LOC H15 [get_ports {ddram_a[11]}]
+set_property SLEW FAST [get_ports {ddram_a[11]}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_a[11]}]
+
+# ddram:0.a
+set_property LOC C18 [get_ports {ddram_a[12]}]
+set_property SLEW FAST [get_ports {ddram_a[12]}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_a[12]}]
+
+# ddram:0.a
+set_property LOC G15 [get_ports {ddram_a[13]}]
+set_property SLEW FAST [get_ports {ddram_a[13]}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_a[13]}]
+
+# ddram:0.ba
+set_property LOC B17 [get_ports {ddram_ba[0]}]
+set_property SLEW FAST [get_ports {ddram_ba[0]}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_ba[0]}]
+
+# ddram:0.ba
+set_property LOC D18 [get_ports {ddram_ba[1]}]
+set_property SLEW FAST [get_ports {ddram_ba[1]}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_ba[1]}]
+
+# ddram:0.ba
+set_property LOC A17 [get_ports {ddram_ba[2]}]
+set_property SLEW FAST [get_ports {ddram_ba[2]}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_ba[2]}]
+
+# ddram:0.ras_n
+set_property LOC A19 [get_ports {ddram_ras_n}]
+set_property SLEW FAST [get_ports {ddram_ras_n}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_ras_n}]
+
+# ddram:0.cas_n
+set_property LOC B19 [get_ports {ddram_cas_n}]
+set_property SLEW FAST [get_ports {ddram_cas_n}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_cas_n}]
+
+# ddram:0.we_n
+set_property LOC A18 [get_ports {ddram_we_n}]
+set_property SLEW FAST [get_ports {ddram_we_n}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_we_n}]
+
+# ddram:0.dm
+set_property LOC A22 [get_ports {ddram_dm[0]}]
+set_property SLEW FAST [get_ports {ddram_dm[0]}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_dm[0]}]
+
+# ddram:0.dm
+set_property LOC C22 [get_ports {ddram_dm[1]}]
+set_property SLEW FAST [get_ports {ddram_dm[1]}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_dm[1]}]
+
+# ddram:0.dq
+set_property LOC D21 [get_ports {ddram_dq[0]}]
+set_property SLEW FAST [get_ports {ddram_dq[0]}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_dq[0]}]
+set_property IN_TERM UNTUNED_SPLIT_40 [get_ports {ddram_dq[0]}]
+
+# ddram:0.dq
+set_property LOC C21 [get_ports {ddram_dq[1]}]
+set_property SLEW FAST [get_ports {ddram_dq[1]}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_dq[1]}]
+set_property IN_TERM UNTUNED_SPLIT_40 [get_ports {ddram_dq[1]}]
+
+# ddram:0.dq
+set_property LOC B22 [get_ports {ddram_dq[2]}]
+set_property SLEW FAST [get_ports {ddram_dq[2]}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_dq[2]}]
+set_property IN_TERM UNTUNED_SPLIT_40 [get_ports {ddram_dq[2]}]
+
+# ddram:0.dq
+set_property LOC B21 [get_ports {ddram_dq[3]}]
+set_property SLEW FAST [get_ports {ddram_dq[3]}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_dq[3]}]
+set_property IN_TERM UNTUNED_SPLIT_40 [get_ports {ddram_dq[3]}]
+
+# ddram:0.dq
+set_property LOC D19 [get_ports {ddram_dq[4]}]
+set_property SLEW FAST [get_ports {ddram_dq[4]}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_dq[4]}]
+set_property IN_TERM UNTUNED_SPLIT_40 [get_ports {ddram_dq[4]}]
+
+# ddram:0.dq
+set_property LOC E20 [get_ports {ddram_dq[5]}]
+set_property SLEW FAST [get_ports {ddram_dq[5]}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_dq[5]}]
+set_property IN_TERM UNTUNED_SPLIT_40 [get_ports {ddram_dq[5]}]
+
+# ddram:0.dq
+set_property LOC C19 [get_ports {ddram_dq[6]}]
+set_property SLEW FAST [get_ports {ddram_dq[6]}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_dq[6]}]
+set_property IN_TERM UNTUNED_SPLIT_40 [get_ports {ddram_dq[6]}]
+
+# ddram:0.dq
+set_property LOC D20 [get_ports {ddram_dq[7]}]
+set_property SLEW FAST [get_ports {ddram_dq[7]}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_dq[7]}]
+set_property IN_TERM UNTUNED_SPLIT_40 [get_ports {ddram_dq[7]}]
+
+# ddram:0.dq
+set_property LOC C23 [get_ports {ddram_dq[8]}]
+set_property SLEW FAST [get_ports {ddram_dq[8]}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_dq[8]}]
+set_property IN_TERM UNTUNED_SPLIT_40 [get_ports {ddram_dq[8]}]
+
+# ddram:0.dq
+set_property LOC D23 [get_ports {ddram_dq[9]}]
+set_property SLEW FAST [get_ports {ddram_dq[9]}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_dq[9]}]
+set_property IN_TERM UNTUNED_SPLIT_40 [get_ports {ddram_dq[9]}]
+
+# ddram:0.dq
+set_property LOC B24 [get_ports {ddram_dq[10]}]
+set_property SLEW FAST [get_ports {ddram_dq[10]}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_dq[10]}]
+set_property IN_TERM UNTUNED_SPLIT_40 [get_ports {ddram_dq[10]}]
+
+# ddram:0.dq
+set_property LOC B25 [get_ports {ddram_dq[11]}]
+set_property SLEW FAST [get_ports {ddram_dq[11]}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_dq[11]}]
+set_property IN_TERM UNTUNED_SPLIT_40 [get_ports {ddram_dq[11]}]
+
+# ddram:0.dq
+set_property LOC C24 [get_ports {ddram_dq[12]}]
+set_property SLEW FAST [get_ports {ddram_dq[12]}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_dq[12]}]
+set_property IN_TERM UNTUNED_SPLIT_40 [get_ports {ddram_dq[12]}]
+
+# ddram:0.dq
+set_property LOC C26 [get_ports {ddram_dq[13]}]
+set_property SLEW FAST [get_ports {ddram_dq[13]}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_dq[13]}]
+set_property IN_TERM UNTUNED_SPLIT_40 [get_ports {ddram_dq[13]}]
+
+# ddram:0.dq
+set_property LOC A25 [get_ports {ddram_dq[14]}]
+set_property SLEW FAST [get_ports {ddram_dq[14]}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_dq[14]}]
+set_property IN_TERM UNTUNED_SPLIT_40 [get_ports {ddram_dq[14]}]
+
+# ddram:0.dq
+set_property LOC B26 [get_ports {ddram_dq[15]}]
+set_property SLEW FAST [get_ports {ddram_dq[15]}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_dq[15]}]
+set_property IN_TERM UNTUNED_SPLIT_40 [get_ports {ddram_dq[15]}]
+
+# ddram:0.dqs_p
+set_property LOC B20 [get_ports {ddram_dqs_p[0]}]
+set_property SLEW FAST [get_ports {ddram_dqs_p[0]}]
+set_property IOSTANDARD DIFF_SSTL135 [get_ports {ddram_dqs_p[0]}]
+set_property IN_TERM UNTUNED_SPLIT_40 [get_ports {ddram_dqs_p[0]}]
+
+# ddram:0.dqs_p
+set_property LOC A23 [get_ports {ddram_dqs_p[1]}]
+set_property SLEW FAST [get_ports {ddram_dqs_p[1]}]
+set_property IOSTANDARD DIFF_SSTL135 [get_ports {ddram_dqs_p[1]}]
+set_property IN_TERM UNTUNED_SPLIT_40 [get_ports {ddram_dqs_p[1]}]
+
+# ddram:0.dqs_n
+set_property LOC A20 [get_ports {ddram_dqs_n[0]}]
+set_property SLEW FAST [get_ports {ddram_dqs_n[0]}]
+set_property IOSTANDARD DIFF_SSTL135 [get_ports {ddram_dqs_n[0]}]
+set_property IN_TERM UNTUNED_SPLIT_40 [get_ports {ddram_dqs_n[0]}]
+
+# ddram:0.dqs_n
+set_property LOC A24 [get_ports {ddram_dqs_n[1]}]
+set_property SLEW FAST [get_ports {ddram_dqs_n[1]}]
+set_property IOSTANDARD DIFF_SSTL135 [get_ports {ddram_dqs_n[1]}]
+set_property IN_TERM UNTUNED_SPLIT_40 [get_ports {ddram_dqs_n[1]}]
+
+# ddram:0.clk_p
+set_property LOC F18 [get_ports {ddram_clk_p}]
+set_property SLEW FAST [get_ports {ddram_clk_p}]
+set_property IOSTANDARD DIFF_SSTL135 [get_ports {ddram_clk_p}]
+
+# ddram:0.clk_n
+set_property LOC F19 [get_ports {ddram_clk_n}]
+set_property SLEW FAST [get_ports {ddram_clk_n}]
+set_property IOSTANDARD DIFF_SSTL135 [get_ports {ddram_clk_n}]
+
+# ddram:0.cke
+set_property LOC E18 [get_ports {ddram_cke}]
+set_property SLEW FAST [get_ports {ddram_cke}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_cke}]
+
+# ddram:0.odt
+set_property LOC G19 [get_ports {ddram_odt}]
+set_property SLEW FAST [get_ports {ddram_odt}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_odt}]
+
+# ddram:0.reset_n
+set_property LOC H17 [get_ports {ddram_reset_n}]
+set_property SLEW FAST [get_ports {ddram_reset_n}]
+set_property IOSTANDARD SSTL135 [get_ports {ddram_reset_n}]
+
+################################################################################
+# Design constraints and bitstream attributes
+################################################################################
+
+set_property INTERNAL_VREF 0.675 [get_iobanks 16]
+
+set_property CONFIG_VOLTAGE 3.3 [current_design]
+set_property CFGBVS VCCO [current_design]
+
+set_property BITSTREAM.GENERAL.COMPRESS TRUE [current_design]
+set_property BITSTREAM.CONFIG.CONFIGRATE 33 [current_design]
+set_property CONFIG_MODE SPIx4 [current_design]
+
+################################################################################
+# Clock constraints
+################################################################################
+
+create_clock -name sys_clk_pin -period 20.00 [get_ports { ext_clk }];
+
+create_clock -name eth_rx_clk -period 8.0 [get_nets has_liteeth.liteeth/eth_rx_clk]
+create_clock -name eth_tx_clk -period 8.0 [get_nets has_liteeth.liteeth/eth_tx_clk]
+
+set_clock_groups -group [get_clocks -include_generated_clocks -of [get_nets has_liteeth.liteeth/sys_clk]] -group [get_clocks -include_generated_clocks -of [get_nets has_liteeth.liteeth/eth_rx_clk]] -asynchronous
+
+set_clock_groups -group [get_clocks -include_generated_clocks -of [get_nets has_liteeth.liteeth/sys_clk]] -group [get_clocks -include_generated_clocks -of [get_nets has_liteeth.liteeth/eth_tx_clk]] -asynchronous
+
+set_clock_groups -group [get_clocks -include_generated_clocks -of [get_nets has_liteeth.liteeth/eth_rx_clk]] -group [get_clocks -include_generated_clocks -of [get_nets has_liteeth.liteeth/eth_tx_clk]] -asynchronous
+
+################################################################################
+# False path constraints (from LiteX as they relate to LiteDRAM and LiteEth)
+################################################################################
+
+set_false_path -quiet -through [get_nets -hierarchical -filter {mr_ff == TRUE}]
+
+set_false_path -quiet -to [get_pins -filter {REF_PIN_NAME == PRE} -of_objects [get_cells -hierarchical -filter {ars_ff1 == TRUE || ars_ff2 == TRUE}]]
+
+set_max_delay 2 -quiet -from [get_pins -filter {REF_PIN_NAME == C} -of_objects [get_cells -hierarchical -filter {ars_ff1 == TRUE}]] -to [get_pins -filter {REF_PIN_NAME == D} -of_objects [get_cells -hierarchical -filter {ars_ff2 == TRUE}]]
--- a/fpu.vhdl
+++ b/fpu.vhdl
@ -16,7 +16,7 @@ entity fpu is
        clk : in std_ulogic;
        rst : in std_ulogic;

-        e_in  : in  Execute1toFPUType;
+        e_in  : in  Execute1ToFPUType;
        e_out : out FPUToExecute1Type;

        w_out : out FPUToWritebackType
@ -73,8 +73,11 @@ architecture behaviour of fpu is
        busy         : std_ulogic;
        instr_done   : std_ulogic;
        do_intr      : std_ulogic;
+        illegal      : std_ulogic;
        op           : insn_type_t;
        insn         : std_ulogic_vector(31 downto 0);
+        nia          : std_ulogic_vector(63 downto 0);
+        instr_tag    : instr_tag_t;
        dest_fpr     : gspr_index_t;
        fe_mode      : std_ulogic;
        rc           : std_ulogic;
@ -157,7 +160,7 @@ architecture behaviour of fpu is

    constant BIN_ZERO : std_ulogic_vector(1 downto 0) := "00";
    constant BIN_R    : std_ulogic_vector(1 downto 0) := "01";
-    constant BIN_MASK : std_ulogic_vector(1 downto 0) := "10";
+    constant BIN_RND  : std_ulogic_vector(1 downto 0) := "10";
    constant BIN_PS6  : std_ulogic_vector(1 downto 0) := "11";

    constant RES_SUM   : std_ulogic_vector(1 downto 0) := "00";
@ -194,7 +197,7 @@ architecture behaviour of fpu is
    -- Each output value is the inverse of the center of the input
    -- range for the value, i.e. entry 0 is 1 / (1 + 1/512),
    -- entry 1 is 1 / (1 + 3/512), etc.
-    signal inverse_table : lookup_table := (
+    constant inverse_table : lookup_table := (
        -- 1/x lookup table
        -- Unit bit is assumed to be 1, so input range is [1, 2)
        18x"3fc01", 18x"3f411", 18x"3ec31", 18x"3e460", 18x"3dc9f", 18x"3d4ec", 18x"3cd49", 18x"3c5b5",
@ -546,6 +549,10 @@ begin
                r.do_intr <= '0';
                r.fpscr <= (others => '0');
                r.writing_back <= '0';
+                r.dest_fpr <= (others =>'0');
+                r.cr_mask <= (others =>'0');
+                r.cr_result <= (others =>'0');
+                r.instr_tag.valid <= '0';
            else
                assert not (r.state /= IDLE and e_in.valid = '1') severity failure;
                r <= rin;
@ -571,9 +578,9 @@ begin

    e_out.busy <= r.busy;
    e_out.exception <= r.fpscr(FPSCR_FEX);
-    e_out.interrupt <= r.do_intr;

    w_out.valid <= r.instr_done and not r.do_intr;
+    w_out.instr_tag <= r.instr_tag;
    w_out.write_enable <= r.writing_back;
    w_out.write_reg <= r.dest_fpr;
    w_out.write_data <= fp_result;
@ -581,6 +588,10 @@ begin
    w_out.write_cr_mask <= r.cr_mask;
    w_out.write_cr_data <= r.cr_result & r.cr_result & r.cr_result & r.cr_result &
                           r.cr_result & r.cr_result & r.cr_result & r.cr_result;
+    w_out.interrupt <= r.do_intr;
+    w_out.intr_vec <= 16#700#;
+    w_out.srr0 <= r.nia;
+    w_out.srr1 <= (47-44 => r.illegal, 47-43 => not r.illegal, others => '0');

    fpu_1: process(all)
        variable v           : reg_type;
@ -632,6 +643,7 @@ begin
        variable mulexp      : signed(EXP_BITS-1 downto 0);
        variable maddend     : std_ulogic_vector(127 downto 0);
        variable sum         : std_ulogic_vector(63 downto 0);
+        variable round_inc   : std_ulogic_vector(63 downto 0);
    begin
        v := r;
        illegal := '0';
@ -641,7 +653,9 @@ begin
        -- capture incoming instruction
        if e_in.valid = '1' then
            v.insn := e_in.insn;
+            v.nia := e_in.nia;
            v.op := e_in.op;
+            v.instr_tag := e_in.itag;
            v.fe_mode := or (e_in.fe_mode);
            v.dest_fpr := e_in.frt;
            v.single_prec := e_in.single;
@ -1117,7 +1131,6 @@ begin
                    elsif r.b.exponent > to_signed(127, EXP_BITS) then
                        v.state := ROUND_OFLOW;
                    else
-                        v.shift := to_signed(-2, EXP_BITS);
                        v.state := ROUNDING;
                    end if;
                else
@ -1619,7 +1632,6 @@ begin
                    -- sum overflowed, shift right
                    opsel_r <= RES_SHIFT;
                    set_x := '1';
-                    v.shift := to_signed(-2, EXP_BITS);
                    if exp_huge = '1' then
                        v.state := ROUND_OFLOW;
                    else
@ -1627,7 +1639,6 @@ begin
                    end if;
                elsif r.r(54) = '1' then
                    set_x := '1';
-                    v.shift := to_signed(-2, EXP_BITS);
                    v.state := ROUNDING;
                elsif (r_hi_nz or r_lo_nz or r.r(1) or r.r(0)) = '0' then
                    -- r.x must be zero at this point
@ -1704,22 +1715,19 @@ begin
                opsel_r <= RES_MULT;
                opsel_s <= S_MULT;
                set_s := '1';
-                v.shift := to_signed(56, EXP_BITS);
                if multiply_to_f.valid = '1' then
-                    if multiply_to_f.result(121) = '1' then
-                        v.state := FMADD_5;
-                    else
-                        v.state := FMADD_6;
-                    end if;
+                    v.state := FMADD_5;
                end if;

            when FMADD_5 =>
-                -- negate R:S:X
-                v.result_sign := not r.result_sign;
-                opsel_ainv <= '1';
-                carry_in <= not (s_nz or r.x);
-                opsel_s <= S_NEG;
-                set_s := '1';
+                -- negate R:S:X if negative
+                if r.r(63) = '1' then
+                    v.result_sign := not r.result_sign;
+                    opsel_ainv <= '1';
+                    carry_in <= not (s_nz or r.x);
+                    opsel_s <= S_NEG;
+                    set_s := '1';
+                end if;
                v.shift := to_signed(56, EXP_BITS);
                v.state := FMADD_6;

@ -2088,7 +2096,6 @@ begin
                -- r.shift = b.exponent - 52
                opsel_r <= RES_SHIFT;
                set_x := '1';
-                v.shift := to_signed(-2, EXP_BITS);
                v.state := ROUNDING;

            when FINISH =>
@ -2106,7 +2113,6 @@ begin
                    elsif exp_huge = '1' then
                        v.state := ROUND_OFLOW;
                    else
-                        v.shift := to_signed(-2, EXP_BITS);
                        v.state := ROUNDING;
                    end if;
                end if;
@ -2122,7 +2128,6 @@ begin
                elsif exp_huge = '1' then
                    v.state := ROUND_OFLOW;
                else
-                    v.shift := to_signed(-2, EXP_BITS);
                    v.state := ROUNDING;
                end if;

@ -2134,7 +2139,6 @@ begin
                    -- have to denormalize before rounding
                    opsel_r <= RES_SHIFT;
                    set_x := '1';
-                    v.shift := to_signed(-2, EXP_BITS);
                    v.state := ROUNDING;
                else
                    -- enabled underflow exception case
@ -2145,7 +2149,6 @@ begin
                        renormalize := '1';
                        v.state := NORMALIZE;
                    else
-                        v.shift := to_signed(-2, EXP_BITS);
                        v.state := ROUNDING;
                    end if;
                end if;
@ -2172,7 +2175,6 @@ begin
                else
                    -- enabled overflow exception
                    v.result_exp := r.result_exp - bias_exp;
-                    v.shift := to_signed(-2, EXP_BITS);
                    v.state := ROUNDING;
                end if;

@ -2181,9 +2183,8 @@ begin
                round := fp_rounding(r.r, r.x, r.single_prec, r.round_mode, r.result_sign);
                v.fpscr(FPSCR_FR downto FPSCR_FI) := round;
                if round(1) = '1' then
-                    -- set mask to increment the LSB for the precision
-                    opsel_b <= BIN_MASK;
-                    carry_in <= '1';
+                    -- increment the LSB for the precision
+                    opsel_b <= BIN_RND;
                    v.shift := to_signed(-1, EXP_BITS);
                    v.state := ROUNDING_2;
                else
@ -2405,8 +2406,9 @@ begin
                in_b0 := (others => '0');
            when BIN_R =>
                in_b0 := r.r;
-            when BIN_MASK =>
-                in_b0 := mask;
+            when BIN_RND =>
+                round_inc := (31 => r.single_prec, 2 => not r.single_prec, others => '0');
+                in_b0 := round_inc;
            when others =>
                -- BIN_PS6, 6 LSBs of P/4 sign-extended to 64
                in_b0 := std_ulogic_vector(resize(signed(r.p(7 downto 2)), 64));
@ -2423,7 +2425,10 @@ begin
        end if;
        sum := std_ulogic_vector(unsigned(in_a) + unsigned(in_b) + carry_in);
        if opsel_mask = '1' then
-            sum := sum and not mask;
+            sum(1 downto 0) := "00";
+            if r.single_prec = '1' then
+                sum(30 downto 2) := (others => '0');
+            end if;
        end if;
        case opsel_r is
            when RES_SUM =>
@ -2548,9 +2553,10 @@ begin
            v.cr_result := v.fpscr(FPSCR_FX downto FPSCR_OX);
        end if;

+        v.illegal := illegal;
        if illegal = '1' then
            v.instr_done := '0';
-            v.do_intr := '0';
+            v.do_intr := '1';
            v.writing_back := '0';
            v.busy := '0';
            v.state := IDLE;
@ -2562,7 +2568,6 @@ begin
        end if;

        rin <= v;
-        e_out.illegal <= illegal;
    end process;

 end architecture behaviour;
--- a/gpio.vhdl
+++ b/gpio.vhdl
@ -0,0 +1,99 @@
+-- GPIO module for microwatt
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+library work;
+use work.wishbone_types.all;
+
+entity gpio is
+    generic (
+        NGPIO : integer := 32  -- number of GPIO lines; sets the width of gpio_in/gpio_out/gpio_dir
+        );
+    port (
+        clk : in std_ulogic;
+        rst : in std_ulogic;  -- active high, sampled on the rising edge of clk
+
+        -- Wishbone
+        wb_in  : in wb_io_master_out;
+        wb_out : out wb_io_slave_out;  -- registered, so the response follows the request by one clock
+
+        -- GPIO lines
+        gpio_in  : in std_ulogic_vector(NGPIO - 1 downto 0);
+        gpio_out : out std_ulogic_vector(NGPIO - 1 downto 0);
+        -- 1 = output, 0 = input
+        gpio_dir : out std_ulogic_vector(NGPIO - 1 downto 0);
+
+        -- Interrupt
+        intr : out std_ulogic  -- tied to '0' by architecture behaviour
+        );
+end entity gpio;
+
+architecture behaviour of gpio is
+    constant GPIO_REG_BITS  : positive := 5;  -- number of low wb_in.adr bits used to select a register
+
+    -- Register numbers, compared against wb_in.adr(GPIO_REG_BITS - 1 downto 0); 4 bytes per reg
+    constant GPIO_REG_DATA_OUT : std_ulogic_vector(GPIO_REG_BITS-1 downto 0) := "00000";  -- read/write output data
+    constant GPIO_REG_DATA_IN  : std_ulogic_vector(GPIO_REG_BITS-1 downto 0) := "00001";  -- read only, writes are ignored
+    constant GPIO_REG_DIR      : std_ulogic_vector(GPIO_REG_BITS-1 downto 0) := "00010";  -- read/write direction
+    constant GPIO_REG_DATA_SET : std_ulogic_vector(GPIO_REG_BITS-1 downto 0) := "00100";  -- write only: ORs the written bits into reg_data
+    constant GPIO_REG_DATA_CLR : std_ulogic_vector(GPIO_REG_BITS-1 downto 0) := "00101";  -- write only: clears the written bits in reg_data
+
+    -- Current output value and direction
+    signal reg_data : std_ulogic_vector(NGPIO - 1 downto 0);
+    signal reg_dirn : std_ulogic_vector(NGPIO - 1 downto 0);
+    signal reg_in1  : std_ulogic_vector(NGPIO - 1 downto 0);  -- first register stage sampling gpio_in
+    signal reg_in2  : std_ulogic_vector(NGPIO - 1 downto 0);  -- second register stage; this is what DATA_IN reads back
+
+    signal wb_rsp   : wb_io_slave_out;  -- combinational response, registered into wb_out
+    signal reg_out  : std_ulogic_vector(NGPIO - 1 downto 0);  -- read data selected by the register number
+
+begin
+
+    -- No interrupt facility for now
+    intr <= '0';
+
+    gpio_out <= reg_data;
+    gpio_dir <= reg_dirn;
+
+    -- Wishbone response
+    wb_rsp.ack <= wb_in.cyc and wb_in.stb;  -- every strobe is acked, reads and writes alike
+    with wb_in.adr(GPIO_REG_BITS - 1 downto 0) select reg_out <=
+        reg_data when GPIO_REG_DATA_OUT,
+        reg_in2  when GPIO_REG_DATA_IN,
+        reg_dirn when GPIO_REG_DIR,
+        (others => '0') when others;  -- DATA_SET, DATA_CLR and unused register numbers read as zero
+    wb_rsp.dat(wb_rsp.dat'left downto NGPIO) <= (others => '0');  -- data bits above the GPIO lines read as zero
+    wb_rsp.dat(NGPIO - 1 downto 0) <= reg_out;
+    wb_rsp.stall <= '0';  -- never stalls the bus
+
+    regs_rw: process(clk)
+    begin
+        if rising_edge(clk) then
+            wb_out <= wb_rsp;  -- register the response
+            reg_in2 <= reg_in1;  -- gpio_in passes through two register stages before it can be read
+            reg_in1 <= gpio_in;
+            if rst = '1' then
+                reg_data <= (others => '0');
+                reg_dirn <= (others => '0');  -- all lines become inputs (0 = input)
+                wb_out.ack <= '0';  -- overrides the ack copied from wb_rsp above while in reset
+            else
+                if wb_in.cyc = '1' and wb_in.stb = '1' and wb_in.we = '1' then
+                    case wb_in.adr(GPIO_REG_BITS - 1 downto 0) is
+                        when GPIO_REG_DATA_OUT =>
+                            reg_data <= wb_in.dat(NGPIO - 1 downto 0);
+                        when GPIO_REG_DIR =>
+                            reg_dirn <= wb_in.dat(NGPIO - 1 downto 0);
+                        when GPIO_REG_DATA_SET =>
+                            reg_data <= reg_data or wb_in.dat(NGPIO - 1 downto 0);
+                        when GPIO_REG_DATA_CLR =>
+                            reg_data <= reg_data and not wb_in.dat(NGPIO - 1 downto 0);
+                        when others =>  -- writes to any other register number are ignored
+                    end case;
+                end if;
+            end if;
+        end if;
+    end process;
+
+end architecture behaviour;
+        
--- a/gpr_hazard.vhdl
+++ b/gpr_hazard.vhdl
@ -1,107 +0,0 @@
-library ieee;
-use ieee.std_logic_1164.all;
-use ieee.numeric_std.all;
-
-library work;
-use work.common.all;
-
-entity gpr_hazard is
-    generic (
-        PIPELINE_DEPTH : natural := 1
-        );
-    port(
-        clk                : in std_ulogic;
-        busy_in            : in std_ulogic;
-        deferred           : in std_ulogic;
-        complete_in        : in std_ulogic;
-        flush_in           : in std_ulogic;
-        issuing            : in std_ulogic;
-
-        gpr_write_valid_in : in std_ulogic;
-        gpr_write_in       : in gspr_index_t;
-        bypass_avail       : in std_ulogic;
-        gpr_read_valid_in  : in std_ulogic;
-        gpr_read_in        : in gspr_index_t;
-
-        ugpr_write_valid   : in std_ulogic;
-        ugpr_write_reg     : in gspr_index_t;
-
-        stall_out          : out std_ulogic;
-        use_bypass         : out std_ulogic
-        );
-end entity gpr_hazard;
-architecture behaviour of gpr_hazard is
-    type pipeline_entry_type is record
-        valid  : std_ulogic;
-        bypass : std_ulogic;
-        gpr    : gspr_index_t;
-        ugpr_valid : std_ulogic;
-        ugpr   : gspr_index_t;
-    end record;
-    constant pipeline_entry_init : pipeline_entry_type := (valid => '0', bypass => '0', gpr => (others => '0'),
-                                                           ugpr_valid => '0', ugpr => (others => '0'));
-
-    type pipeline_t is array(0 to PIPELINE_DEPTH) of pipeline_entry_type;
-    constant pipeline_t_init : pipeline_t := (others => pipeline_entry_init);
-
-    signal r, rin : pipeline_t := pipeline_t_init;
-begin
-    gpr_hazard0: process(clk)
-    begin
-        if rising_edge(clk) then
-            r <= rin;
-        end if;
-    end process;
-
-    gpr_hazard1: process(all)
-        variable v     : pipeline_t;
-    begin
-        v := r;
-
-        if complete_in = '1' then
-            v(PIPELINE_DEPTH).valid := '0';
-            v(PIPELINE_DEPTH).ugpr_valid := '0';
-        end if;
-
-        stall_out <= '0';
-        use_bypass <= '0';
-        if gpr_read_valid_in = '1' then
-            loop_0: for i in 0 to PIPELINE_DEPTH loop
-                if v(i).valid = '1' and r(i).gpr = gpr_read_in then
-                    if r(i).bypass = '1' then
-                        use_bypass <= '1';
-                    else
-                        stall_out <= '1';
-                    end if;
-                end if;
-                if v(i).ugpr_valid = '1' and r(i).ugpr = gpr_read_in then
-                    stall_out <= '1';
-                end if;
-            end loop;
-        end if;
-
-        -- XXX assumes PIPELINE_DEPTH = 1
-        if busy_in = '0' then
-            v(1) := v(0);
-            v(0).valid := '0';
-            v(0).ugpr_valid := '0';
-        end if;
-        if deferred = '0' and issuing = '1' then
-            v(0).valid  := gpr_write_valid_in;
-            v(0).bypass := bypass_avail;
-            v(0).gpr    := gpr_write_in;
-            v(0).ugpr_valid := ugpr_write_valid;
-            v(0).ugpr   := ugpr_write_reg;
-        end if;
-        if flush_in = '1' then
-            v(0).valid := '0';
-            v(0).ugpr_valid := '0';
-            v(1).valid := '0';
-            v(1).ugpr_valid := '0';
-        end if;
-
-        -- update registers
-        rin <= v;
-
-    end process;
-end;
--- a/hello_world/head.S
+++ b/hello_world/head.S
@ -60,11 +60,25 @@ _start:

 .global boot_entry
 boot_entry:
+	LOAD_IMM64(%r10,__bss_start)
+	LOAD_IMM64(%r11,__bss_end)
+	subf	%r11,%r10,%r11
+	addi	%r11,%r11,63
+	srdi.	%r11,%r11,6
+	beq	2f
+	mtctr	%r11
+1:	dcbz	0,%r10
+	addi	%r10,%r10,64
+	bdnz	1b
+
 	/* setup stack */
-	LOAD_IMM64(%r1, STACK_TOP - 0x100)
+2:	LOAD_IMM64(%r1,__stack_top)
+	li	%r0,0
+	stdu	%r0,-32(%r1)
 	LOAD_IMM64(%r12, main)
-	mtctr	%r12,
+	mtctr	%r12
 	bctrl
+	attn // terminate on exit
 	b .

 #define EXCEPTION(nr)		\
--- a/hello_world/hello_world.bin
+++ b/hello_world/hello_world.bin
--- a/hello_world/hello_world.elf
+++ b/hello_world/hello_world.elf
--- a/hello_world/hello_world.hex
+++ b/hello_world/hello_world.hex
@ -35,24 +35,24 @@ a64b5a7d14004a39
 a602487d05009f42
 a64b5a7d14004a39
 2402004ca64b7b7d
-3c20000048000004
+3d40000048000004
+794a07c6614a0000
+614a1900654a0000
+616b00003d600000
+656b0000796b07c6
+7d6a5850616b1980
+796bd183396b003f
+7d6903a641820014
+394a00407c0057ec
+3c2000004200fff8
 782107c660210000
-60211f0064210000
-618c00003d800000
-658c0000798c07c6
-7d8903a6618c1014
-480000004e800421
-0000000000000000
-0000000000000000
-0000000000000000
-0000000000000000
-0000000000000000
-0000000000000000
-0000000000000000
-0000000000000000
-0000000000000000
-0000000000000000
-0000000000000000
+6021398064210000
+f801ffe138000000
+3d8000007c1243a6
+798c07c6618c0000
+618c1000658c0000
+4e8004217d8903a6
+4800000000000200
 0000000000000000
 0000000000000000
 0000000000000000
@ -510,150 +510,150 @@ a64b5a7d14004a39
 0000000000000000
 0000000000000000
 0000000000000000
-e8010010ebc1fff0
-7c0803a6ebe1fff8
-3c4000014e800020
-7c0802a638429800
-f8010010fbe1fff8
-480001edf821ffd1
-6000000060000000
-4800015538628000
-4800004960000000
-7c7f1b7860000000
-57ff063e5463063e
-60000000480000b9
-4082ffe02c1f000d
-480000a53860000a
-4bffffd060000000
-0100000000000000
-3c40000100000180
-6000000038429800
-6000000089228090
-2c09000039428088
-e92a000041820030
-7c0004ac39290014
-712900017d204eaa
-e86a00004182ffec
-7c601eaa7c0004ac
-4e8000205463063e
-39290010e92a0000
-7d204eea7c0004ac
-4082ffec71290001
-38630008e86a0000
-7c601eea7c0004ac
-000000004bffffd0
-0000000000000000
-384298003c400001
-8922809060000000
-3942808860000000
-4182002c2c090000
-39290014e92a0000
-7d204eaa7c0004ac
-4182ffec71290020
-7c0004ace92a0000
-4e8000207c604faa
-39290010e92a0000
-7d204eea7c0004ac
-4082ffec71290008
-e94a00005469063e
-7d2057ea7c0004ac
-000000004e800020
-0000000000000000
 384298003c400001
 fbe1fff87c0802a6
-3be3fffffbc1fff0
 f821ffd1f8010010
-2c3e00008fdf0001
-3821003040820010
-4bfffe4438600000
-4082000c281e000a
-4bffff453860000d
-4bffff3d7fc3f378
+60000000480001ed
+3862800060000000
+6000000048000155
+6000000048000049
+5463063e7c7f1b78
+480000b957ff063e
+2c1f000d60000000
+3860000a4082ffe0
+60000000480000a5
 000000004bffffd0
-0000028001000000
-386000007c691b78
-2c0a00007d4918ae
-386300014d820020
-000000004bfffff0
-0000000000000000
+0000018001000000
 384298003c400001
-614a00203d40c000
-7c0004ac794a0020
-3d20c0007d4056ea
-61290008794a0600
-7c0004ac79290020
-712900207d204eea
-3d20c00041820018
-7929002061290040
-7d204eea7c0004ac
-3d00c0007929f804
-6108200079290fc3
-6000000079080020
-3d00001cf9028088
-7d4a439261082000
-6000000041820084
-9922809039200001
-6108200c3d00c000
-790800203920ff80
-7d2047aa7c0004ac
-7c0004ace9228088
-e92280887d404faa
-39290004794ac202
-7d404faa7c0004ac
-39400003e9228088
-7c0004ac3929000c
-e92280887d404faa
+8922810860000000
+3942810060000000
+418200302c090000
+39290014e92a0000
+7d204eaa7c0004ac
+4182ffec71290001
+7c0004ace86a0000
+5463063e7c601eaa
+e92a00004e800020
 7c0004ac39290010
-e92280887d404faa
-3929000839400007
-7d404faa7c0004ac
-600000004e800020
-99228090394affff
-612920183d20c000
-7c0004ac79290020
-4e8000207d404fea
+712900017d204eea
+e86a00004082ffec
+7c0004ac38630008
+4bffffd07c601eea
 0000000000000000
 3c40000100000000
 6000000038429800
-2c24000089228090
-600000002f890000
-419e0030e9228088
-3940000241820024
-418200082c230000
-39290004614a0001
+6000000089228108
+2c09000039428100
+e92a00004182002c
+7c0004ac39290014
+712900207d204eaa
+e92a00004182ffec
+7c604faa7c0004ac
+e92a00004e800020
+7c0004ac39290010
+712900087d204eea
+5469063e4082ffec
+7c0004ace94a0000
+4e8000207d2057ea
+0000000000000000
+3c40000100000000
+7c0802a638429800
+fbc1fff0fbe1fff8
+f80100103be3ffff
+8fdf0001f821ffd1
+408200102c3e0000
+3860000038210030
+281e000a480001e8
+3860000d4082000c
+7fc3f3784bffff45
+4bffffd04bffff3d
+0100000000000000
+7c691b7800000280
+7d4918ae38600000
+4d8200202c0a0000
+4bfffff038630001
+0000000000000000
+3c40000100000000
+3d40c00038429800
+794a0020614a0020
+7d4056ea7c0004ac
+794a06003d20c000
+7929002061290008
+7d204eea7c0004ac
+4182001871290020
+612900403d20c000
+7c0004ac79290020
+7929f8047d204eea
+79290fc33d00c000
+7908002061082000
+f902810060000000
+610820003d00001c
+418200847d4a4392
+3920000160000000
+3d00c00099228108
+3920ff806108200c
+7c0004ac79080020
+e92281007d2047aa
 7d404faa7c0004ac
-394000004e800020
-418200084bffffe0
+794ac202e9228100
+7c0004ac39290004
+e92281007d404faa
+3929000c39400003
+7d404faa7c0004ac
+39290010e9228100
+7d404faa7c0004ac
+39400007e9228100
+7c0004ac39290008
+4e8000207d404faa
+394affff60000000
+3d20c00099228108
+7929002061292018
+7d404fea7c0004ac
+000000004e800020
+0000000000000000
+384298003c400001
+8922810860000000
+600000002c090000
+41820024e9228100
+78840e282c230000
+6084000141820008
+7c0004ac39290004
+4e8000207c804faa
+418200082c240000
 3929002060630002
 7c604fea7c0004ac
 000000004e800020
 0000000000000000
-0000000000000010
-0141780400527a01
-0000001800010c1b
-fffffc4800000018
-300e460000000070
-000000019f7e4111
-0000000000000010
-0141780400527a01
-0000001000010c1b
-fffffc8800000018
-0000000000000084
-0000002c00000010
-00000080fffffcf8
-0000002800000000
-fffffd6400000040
-4109450000000060
-300e43029e019f00
-42000e0a447e4111
-0000000b4106dedf
-0000006c00000010
-00000028fffffd98
+e8010010ebc1fff0
+7c0803a6ebe1fff8
+000000104e800020
+00527a0100000000
+00010c1b01417804
+0000001800000018
+00000070fffffc40
+9f7e4111300e4600
+0000001000000001
+00527a0100000000
+00010c1b01417804
+0000001800000010
+00000084fffffc80
+0000001000000000
+fffffcf00000002c
+0000000000000080
+0000004000000028
+00000060fffffd5c
+9e019f0041094500
+447e4111300e4302
+4106dedf42000e0a
+000000100000000b
+fffffd900000006c
+0000000000000028
+0000008000000010
+0000012cfffffda4
 0000001000000000
-fffffdac00000080
-000000000000012c
-0000009400000010
-00000074fffffec4
+fffffebc00000094
+0000000000000068
+0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
--- a/hello_world/powerpc.lds
+++ b/hello_world/powerpc.lds
@ -1,13 +1,27 @@
 SECTIONS
 {
-	_start = .;
 	. = 0;
+	_start = .;
 	.head : {
 		KEEP(*(.head))
- 	}
+	}
 	. = 0x1000;
-	.text : { *(.text) }
+	.text : { *(.text) *(.text.*) *(.rodata) *(.rodata.*) }
 	. = 0x1800;
-	.data : { *(.data) }
-	.bss : { *(.bss) }
+	.data : { *(.data) *(.data.*) *(.got) *(.toc) }
+	. = ALIGN(0x80);
+	__bss_start = .;
+	.bss : {
+		*(.dynsbss)
+		*(.sbss)
+		*(.scommon)
+		*(.dynbss)
+		*(.bss)
+		*(.common)
+		*(.bss.*)
+	}
+	. = ALIGN(0x80);
+	__bss_end = .;
+	. = . + 0x2000;
+	__stack_top = .;
 }
--- a/helpers.vhdl
+++ b/helpers.vhdl
@ -28,7 +28,9 @@ package helpers is

    function bit_reverse(a: std_ulogic_vector) return std_ulogic_vector;
    function bit_number(a: std_ulogic_vector(63 downto 0)) return std_ulogic_vector;
+    function edgelocation(v: std_ulogic_vector; nbits: natural) return std_ulogic_vector;
    function count_left_zeroes(val: std_ulogic_vector) return std_ulogic_vector;
+    function count_right_zeroes(val: std_ulogic_vector) return std_ulogic_vector;
 end package helpers;

 package body helpers is
@ -247,16 +249,50 @@ package body helpers is
        return ret;
    end;

-    -- Count leading zeroes operation
+    -- Assuming the input 'v' is a value of the form 1...10...0,
+    -- the output is the bit number of the rightmost 1 bit in v.
+    -- If v is zero, the result is zero.
+    function edgelocation(v: std_ulogic_vector; nbits: natural) return std_ulogic_vector is
+        variable p: std_ulogic_vector(nbits - 1 downto 0);  -- result, one bit computed per outer iteration
+        variable stride: natural;  -- block size examined for result bit i: 2**(i+1)
+        variable b: std_ulogic;  -- OR of the per-block edge tests for the current result bit
+        variable k: natural;  -- index of the lowest bit of the current block
+    begin
+        stride := 2;
+        for i in 0 to nbits - 1 loop
+            b := '0';
+            for j in 0 to (2**nbits / stride) - 1 loop  -- visit every stride-aligned block of the 2**nbits input bits
+                k := j * stride;
+                b := b or (v(k + stride - 1) and not v(k + (stride/2) - 1));  -- 0-to-1 edge lies in the upper half of this block
+            end loop;
+            p(i) := b;  -- for a 1...10...0 input this is bit i of the edge position
+            stride := stride * 2;
+        end loop;
+        return p;
+    end function;
+
+    -- Count leading/trailing zeroes operations
    -- Assumes the value passed in is not zero (if it is, zero is returned)
-    function count_left_zeroes(val: std_ulogic_vector) return std_ulogic_vector is
-        variable rev: std_ulogic_vector(val'left downto val'right);
+    function count_right_zeroes(val: std_ulogic_vector) return std_ulogic_vector is
         variable sum: std_ulogic_vector(val'left downto val'right);
         variable onehot: std_ulogic_vector(val'left downto val'right);
+        variable edge: std_ulogic_vector(val'left downto val'right);
+        variable bn, bn_e, bn_o: std_ulogic_vector(5 downto 0);  -- 6-bit bit numbers, matching the 64-bit resize below
+    begin
+        sum := std_ulogic_vector(- signed(val));  -- two's complement negation of val
+        onehot := sum and val;  -- val and -val: only the lowest set bit of val remains
+        edge := sum or val;  -- val or -val: the lowest set bit of val and every bit above it
+        bn_e := edgelocation(std_ulogic_vector(resize(signed(edge), 64)), 6);  -- signed resize extends the top bit, keeping the 1...10...0 form
+        bn_o := bit_number(std_ulogic_vector(resize(unsigned(onehot), 64)));  -- unsigned resize pads the one-hot value with zeroes
+        bn := bn_e(5 downto 2) & bn_o(1 downto 0);  -- upper four bits from edgelocation, lower two from bit_number
+        return bn;
+    end;
+
+    function count_left_zeroes(val: std_ulogic_vector) return std_ulogic_vector is
+        variable rev: std_ulogic_vector(val'left downto val'right);
    begin
        rev := bit_reverse(val);
-        sum := std_ulogic_vector(- signed(rev));
-        onehot := sum and rev;
-        return bit_number(std_ulogic_vector(resize(unsigned(onehot), 64)));
+        return count_right_zeroes(rev);
    end;
+
 end package body helpers;
--- a/icache.vhdl
+++ b/icache.vhdl
@ -46,8 +46,6 @@ entity icache is
        TLB_SIZE : positive := 64;
        -- L1 ITLB log_2(page_size)
        TLB_LG_PGSZ : positive := 12;
-        -- Number of real address bits that we store
-        REAL_ADDR_BITS : positive := 56;
        -- Non-zero to enable log data collection
        LOG_LENGTH : natural := 0
        );
@ -68,6 +66,9 @@ entity icache is
        wishbone_out : out wishbone_master_out;
        wishbone_in  : in wishbone_slave_out;

+        wb_snoop_in  : in wishbone_master_out := wishbone_master_out_init;
+
+        events       : out IcacheEventType;
        log_out      : out std_ulogic_vector(53 downto 0)
        );
 end entity icache;
@ -168,7 +169,7 @@ architecture rtl of icache is
    signal eaa_priv  : std_ulogic;

    -- Cache reload state machine
-    type state_t is (IDLE, CLR_TAG, WAIT_ACK);
+    type state_t is (IDLE, STOP_RELOAD, CLR_TAG, WAIT_ACK);

    type reg_internal_t is record
 	-- Cache hit state (Latches for 1 cycle BRAM access)
@ -176,6 +177,7 @@ architecture rtl of icache is
 	hit_nia   : std_ulogic_vector(63 downto 0);
 	hit_smark : std_ulogic;
 	hit_valid : std_ulogic;
+        big_endian: std_ulogic;

 	-- Cache miss state (reload state machine)
        state            : state_t;
@ -194,6 +196,8 @@ architecture rtl of icache is

    signal r : reg_internal_t;

+    signal ev : IcacheEventType;
+
    -- Async signals on incoming request
    signal req_index   : index_t;
    signal req_row     : row_t;
@ -201,14 +205,13 @@ architecture rtl of icache is
    signal req_tag     : cache_tag_t;
    signal req_is_hit  : std_ulogic;
    signal req_is_miss : std_ulogic;
-    signal req_laddr   : std_ulogic_vector(63 downto 0);
+    signal req_raddr   : real_addr_t;

    signal tlb_req_index : tlb_index_t;
-    signal real_addr     : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0);
+    signal real_addr     : real_addr_t;
    signal ra_valid      : std_ulogic;
    signal priv_fault    : std_ulogic;
    signal access_ok     : std_ulogic;
-    signal use_previous  : std_ulogic;

    -- Cache RAM interface
    type cache_ram_out_t is array(way_t) of cache_row_t;
@ -219,14 +222,19 @@ architecture rtl of icache is
    signal plru_victim : plru_out_t;
    signal replace_way : way_t;

+    -- Memory write snoop signals
+    signal snoop_valid : std_ulogic;
+    signal snoop_index : index_t;
+    signal snoop_hits  : cache_way_valids_t;
+
    -- Return the cache line index (tag index) for an address
-    function get_index(addr: std_ulogic_vector(63 downto 0)) return index_t is
+    function get_index(addr: std_ulogic_vector) return index_t is
    begin
        return to_integer(unsigned(addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS)));
    end;

    -- Return the cache row index (data memory) for an address
-    function get_row(addr: std_ulogic_vector(63 downto 0)) return row_t is
+    function get_row(addr: std_ulogic_vector) return row_t is
    begin
        return to_integer(unsigned(addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS)));
    end;
@ -240,9 +248,9 @@ architecture rtl of icache is
    end;

    -- Returns whether this is the last row of a line
-    function is_last_row_addr(addr: wishbone_addr_type; last: row_in_line_t) return boolean is
+    function is_last_row_wb_addr(wb_addr: wishbone_addr_type; last: row_in_line_t) return boolean is
    begin
-	return unsigned(addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)) = last;
+	return unsigned(wb_addr(LINE_OFF_BITS - ROW_OFF_BITS - 1 downto 0)) = last;
    end;

    -- Returns whether this is the last row of a line
@ -252,16 +260,16 @@ architecture rtl of icache is
    end;

    -- Return the address of the next row in the current cache line
-    function next_row_addr(addr: wishbone_addr_type)
+    function next_row_wb_addr(wb_addr: wishbone_addr_type)
 	return std_ulogic_vector is
 	variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
 	variable result  : wishbone_addr_type;
    begin
 	-- Is there no simpler way in VHDL to generate that 3 bits adder ?
-	row_idx := addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS);
+	row_idx := wb_addr(ROW_LINEBITS - 1 downto 0);
 	row_idx := std_ulogic_vector(unsigned(row_idx) + 1);
-	result := addr;
-	result(LINE_OFF_BITS-1 downto ROW_OFF_BITS) := row_idx;
+	result := wb_addr;
+	result(ROW_LINEBITS - 1 downto 0) := row_idx;
 	return result;
    end;

@ -290,10 +298,9 @@ architecture rtl of icache is
    end;

    -- Get the tag value from the address
-    function get_tag(addr: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0);
-                     endian: std_ulogic) return cache_tag_t is
+    function get_tag(addr: real_addr_t; endian: std_ulogic) return cache_tag_t is
    begin
-        return endian & addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS);
+        return endian & addr(addr'left downto SET_SIZE_BITS);
    end;

    -- Read a tag from a tag memory row
@ -389,7 +396,7 @@ begin
                    wr_dat(ii * 8 + 7 downto ii * 8) <= wishbone_in.dat(j * 8 + 7 downto j * 8);
                end loop;
            end if;
-	    do_read <= not (stall_in or use_previous);
+	    do_read <= not stall_in;
 	    do_write <= '0';
 	    if wishbone_in.ack = '1' and replace_way = i then
 		do_write <= '1';
@ -457,7 +464,7 @@ begin
            end if;
            eaa_priv <= pte(3);
        else
-            real_addr <= i_in.nia(REAL_ADDR_BITS - 1 downto 0);
+            real_addr <= addr_to_real(i_in.nia);
            ra_valid <= '1';
            eaa_priv <= '1';
        end if;
@ -486,6 +493,7 @@ begin
                itlb_ptes(wr_index) <= m_in.pte;
                itlb_valids(wr_index) <= '1';
            end if;
+            ev.itlb_miss_resolved <= m_in.tlbld and not rst;
        end if;
    end process;

@ -494,16 +502,6 @@ begin
 	variable is_hit  : std_ulogic;
 	variable hit_way : way_t;
    begin
-        -- i_in.sequential means that i_in.nia this cycle is 4 more than
-        -- last cycle.  If we read more than 32 bits at a time, had a cache hit
-        -- last cycle, and we don't want the first 32-bit chunk, then we can
-        -- keep the data we read last cycle and just use that.
-        if unsigned(i_in.nia(INSN_BITS+2-1 downto 2)) /= 0 then
-            use_previous <= i_in.sequential and r.hit_valid;
-        else
-            use_previous <= '0';
-        end if;
-
 	-- Extract line, row and tag from request
        req_index <= get_index(i_in.nia);
        req_row <= get_row(i_in.nia);
@ -512,8 +510,7 @@ begin
 	-- Calculate address of beginning of cache row, will be
 	-- used for cache miss processing if needed
 	--
-	req_laddr <= (63 downto REAL_ADDR_BITS => '0') &
-                     real_addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) &
+	req_raddr <= real_addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) &
 		     (ROW_OFF_BITS-1 downto 0 => '0');

 	-- Test if pending request is a hit on any way
@ -558,11 +555,18 @@ begin
 	--       I prefer not to do just yet as it would force fetch2 to know about
 	--       some of the cache geometry information.
 	--
-        i_out.insn <= read_insn_word(r.hit_nia, cache_out(r.hit_way));
+	if r.hit_valid = '1' then
+            i_out.insn <= read_insn_word(r.hit_nia, cache_out(r.hit_way));
+	else
+            i_out.insn <= (others => '0');
+	end if;
 	i_out.valid <= r.hit_valid;
 	i_out.nia <= r.hit_nia;
 	i_out.stop_mark <= r.hit_smark;
        i_out.fetch_failed <= r.fetch_failed;
+        i_out.big_endian <= r.big_endian;
+        i_out.next_predicted <= i_in.predicted;
+        i_out.next_pred_ntaken <= i_in.pred_ntaken;

 	-- Stall fetch1 if we have a miss on cache or TLB or a protection fault
 	stall_out <= not (is_hit and access_ok);
@ -577,8 +581,7 @@ begin
        if rising_edge(clk) then
            -- keep outputs to fetch2 unchanged on a stall
            -- except that flush or reset sets valid to 0
-            -- If use_previous, keep the same data as last cycle and use the second half
-            if stall_in = '1' or use_previous = '1' then
+            if stall_in = '1' then
                if rst = '1' or flush_in = '1' then
                    r.hit_valid <= '0';
                end if;
@ -603,6 +606,7 @@ begin
                -- Send stop marks and NIA down regardless of validity
                r.hit_smark <= i_in.stop_mark;
                r.hit_nia <= i_in.nia;
+                r.big_endian <= i_in.big_endian;
            end if;
 	end if;
    end process;
@ -610,9 +614,13 @@ begin
    -- Cache miss/reload synchronous machine
    icache_miss : process(clk)
 	variable tagset    : cache_tags_set_t;
-	variable stbs_done : boolean;
+        variable tag       : cache_tag_t;
+        variable snoop_addr : real_addr_t;
+        variable snoop_tag : cache_tag_t;
+        variable snoop_cache_tags : cache_tags_set_t;
    begin
        if rising_edge(clk) then
+            ev.icache_miss <= '0';
 	    -- On reset, clear all valid bits to force misses
            if rst = '1' then
 		for i in index_t loop
@ -629,13 +637,42 @@ begin

 		-- Not useful normally but helps avoiding tons of sim warnings
 		r.wb.adr <= (others => '0');
+
+                snoop_valid <= '0';
+                snoop_index <= 0;
+                snoop_hits <= (others => '0');
            else
+                -- Detect snooped writes and decode address into index and tag
+                -- Since we never write, any write should be snooped
+                snoop_valid <= wb_snoop_in.cyc and wb_snoop_in.stb and wb_snoop_in.we;
+                snoop_addr := addr_to_real(wb_to_addr(wb_snoop_in.adr));
+                snoop_index <= get_index(snoop_addr);
+                snoop_cache_tags := cache_tags(get_index(snoop_addr));
+                snoop_tag := get_tag(snoop_addr, '0');
+                snoop_hits <= (others => '0');
+                for i in way_t loop
+                    tag := read_tag(i, snoop_cache_tags);
+                    -- Ignore endian bit in comparison
+                    tag(TAG_BITS - 1) := '0';
+                    if tag = snoop_tag then
+                        snoop_hits(i) <= '1';
+                    end if;
+                end loop;
+
                -- Process cache invalidations
                if inval_in = '1' then
                    for i in index_t loop
                        cache_valids(i) <= (others => '0');
                    end loop;
                    r.store_valid <= '0';
+                else
+                    -- Do invalidations from snooped stores to memory, one
+                    -- cycle after the address appears on wb_snoop_in.
+                    for i in way_t loop
+                        if snoop_valid = '1' and snoop_hits(i) = '1' then
+                            cache_valids(snoop_index)(i) <= '0';
+                        end if;
+                    end loop;
                end if;

 		-- Main state machine
@ -655,18 +692,19 @@ begin
 			    " way:" & integer'image(replace_way) &
 			    " tag:" & to_hstring(req_tag) &
                            " RA:" & to_hstring(real_addr);
+                        ev.icache_miss <= '1';

 			-- Keep track of our index and way for subsequent stores
 			r.store_index <= req_index;
-			r.store_row <= get_row(req_laddr);
+			r.store_row <= get_row(req_raddr);
                        r.store_tag <= req_tag;
                        r.store_valid <= '1';
-                        r.end_row_ix <= get_row_of_line(get_row(req_laddr)) - 1;
+                        r.end_row_ix <= get_row_of_line(get_row(req_raddr)) - 1;

 			-- Prep for first wishbone read. We calculate the address of
 			-- the start of the cache line and start the WB cycle.
 			--
-			r.wb.adr <= req_laddr(r.wb.adr'left downto 0);
+			r.wb.adr <= addr_to_wb(req_raddr);
 			r.wb.cyc <= '1';
 			r.wb.stb <= '1';

@ -693,29 +731,30 @@ begin

                        r.state <= WAIT_ACK;
                    end if;
-		    -- Requests are all sent if stb is 0
-		    stbs_done := r.wb.stb = '0';

 		    -- If we are still sending requests, was one accepted ?
-		    if wishbone_in.stall = '0' and not stbs_done then
-			-- That was the last word ? We are done sending. Clear
-			-- stb and set stbs_done so we can handle an eventual last
-			-- ack on the same cycle.
+		    if wishbone_in.stall = '0' and r.wb.stb = '1' then
+			-- That was the last word ? We are done sending. Clear stb.
 			--
-			if is_last_row_addr(r.wb.adr, r.end_row_ix) then
+			if is_last_row_wb_addr(r.wb.adr, r.end_row_ix) then
 			    r.wb.stb <= '0';
-			    stbs_done := true;
 			end if;

 			-- Calculate the next row address
-			r.wb.adr <= next_row_addr(r.wb.adr);
+			r.wb.adr <= next_row_wb_addr(r.wb.adr);
 		    end if;

+                    -- Abort reload if we get an invalidation
+                    if inval_in = '1' then
+                        r.wb.stb <= '0';
+                        r.state <= STOP_RELOAD;
+                    end if;
+
 		    -- Incoming acks processing
 		    if wishbone_in.ack = '1' then
-                        r.rows_valid(r.store_row mod ROW_PER_LINE) <= '1';
+                        r.rows_valid(r.store_row mod ROW_PER_LINE) <= not inval_in;
 			-- Check for completion
-			if stbs_done and is_last_row(r.store_row, r.end_row_ix) then
+			if is_last_row(r.store_row, r.end_row_ix) then
 			    -- Complete wishbone cycle
 			    r.wb.cyc <= '0';

@ -729,6 +768,18 @@ begin
 			-- Increment store row counter
 			r.store_row <= next_row(r.store_row);
 		    end if;
+
+                when STOP_RELOAD =>
+                    -- Wait for all outstanding requests to be satisfied, then
+                    -- go to IDLE state.
+                    if get_row_of_line(r.store_row) = get_row_of_line(get_row(wb_to_addr(r.wb.adr))) then
+                        r.wb.cyc <= '0';
+                        r.state <= IDLE;
+                    end if;
+                    if wishbone_in.ack = '1' then
+			-- Increment store row counter
+			r.store_row <= next_row(r.store_row);
+		    end if;
 		end case;
 	    end if;

@ -758,7 +809,7 @@ begin
                log_data <= i_out.valid &
                            i_out.insn &
                            wishbone_in.ack &
-                            r.wb.adr(5 downto 3) &
+                            r.wb.adr(2 downto 0) &
                            r.wb.stb & r.wb.cyc &
                            wishbone_in.stall &
                            stall_out &
@ -773,4 +824,7 @@ begin
        end process;
        log_out <= log_data;
    end generate;
+
+    events <= ev;
+
 end;
--- a/icache_tb.vhdl
+++ b/icache_tb.vhdl
@ -34,7 +34,7 @@ begin
            i_out => i_in,
            m_in => m_out,
            stall_in => '0',
-	    flush_in => '0',
+            flush_in => '0',
            inval_in => '0',
            wishbone_out => wb_bram_in,
            wishbone_in => wb_bram_out
@ -73,7 +73,10 @@ begin
    begin
        i_out.req <= '0';
        i_out.nia <= (others => '0');
-	i_out.stop_mark <= '0';
+        i_out.stop_mark <= '0';
+        i_out.priv_mode <= '1';
+        i_out.virt_mode <= '0';
+        i_out.big_endian <= '0';

        m_out.tlbld <= '0';
        m_out.tlbie <= '0';
@ -93,10 +96,10 @@ begin

        assert i_in.valid = '1' severity failure;
        assert i_in.insn = x"00000001"
-	    report "insn @" & to_hstring(i_out.nia) &
-	    "=" & to_hstring(i_in.insn) &
-	    " expected 00000001"
-	    severity failure;
+            report "insn @" & to_hstring(i_out.nia) &
+            "=" & to_hstring(i_in.insn) &
+            " expected 00000001"
+            severity failure;

        i_out.req <= '0';

@ -109,10 +112,10 @@ begin
        wait until rising_edge(clk);
        assert i_in.valid = '1' severity failure;
        assert i_in.insn = x"00000002"
-	    report "insn @" & to_hstring(i_out.nia) &
-	    "=" & to_hstring(i_in.insn) &
-	    " expected 00000002"
-	    severity failure;
+            report "insn @" & to_hstring(i_out.nia) &
+            "=" & to_hstring(i_in.insn) &
+            " expected 00000002"
+            severity failure;
        wait until rising_edge(clk);

        -- another miss
@ -124,10 +127,10 @@ begin

        assert i_in.valid = '1' severity failure;
        assert i_in.insn = x"00000010"
-	    report "insn @" & to_hstring(i_out.nia) &
-	    "=" & to_hstring(i_in.insn) &
-	    " expected 00000010"
-	    severity failure;
+            report "insn @" & to_hstring(i_out.nia) &
+            "=" & to_hstring(i_in.insn) &
+            " expected 00000010"
+            severity failure;

        -- test something that aliases
        i_out.req <= '1';
@ -142,10 +145,10 @@ begin

        assert i_in.valid = '1' severity failure;
        assert i_in.insn = x"00000040"
-	    report "insn @" & to_hstring(i_out.nia) &
-	    "=" & to_hstring(i_in.insn) &
-	    " expected 00000040"
-	    severity failure;
+            report "insn @" & to_hstring(i_out.nia) &
+            "=" & to_hstring(i_in.insn) &
+            " expected 00000040"
+            severity failure;

        i_out.req <= '0';

--- a/include/microwatt_soc.h
+++ b/include/microwatt_soc.h
@ -17,6 +17,7 @@
 #define DRAM_CTRL_BASE	0xc8000000  /* LiteDRAM control registers */
 #define LETH_CSR_BASE	0xc8020000  /* LiteEth CSR registers */
 #define LETH_SRAM_BASE	0xc8030000  /* LiteEth MMIO space */
+#define LSDC_CSR_BASE	0xc8040000  /* LiteSDCard MMIO space */
 #define SPI_FLASH_BASE  0xf0000000  /* SPI Flash memory map */
 #define DRAM_INIT_BASE  0xff000000  /* Internal DRAM init firmware */

@ -40,6 +41,7 @@
 #define   SYS_REG_INFO_HAS_LARGE_SYSCON	        (1ull << 5)
 #define   SYS_REG_INFO_HAS_UART1 		(1ull << 6)
 #define   SYS_REG_INFO_HAS_ARTB                 (1ull << 7)
+#define   SYS_REG_INFO_HAS_LITESDCARD 		(1ull << 8)
 #define SYS_REG_BRAMINFO		0x10
 #define   SYS_REG_BRAMINFO_SIZE_MASK		0xfffffffffffffull
 #define SYS_REG_DRAMINFO		0x18
--- a/insn_helpers.vhdl
+++ b/insn_helpers.vhdl
@ -31,6 +31,7 @@ package insn_helpers is
    function insn_bh (insn_in : std_ulogic_vector) return std_ulogic_vector;
    function insn_d (insn_in : std_ulogic_vector) return std_ulogic_vector;
    function insn_ds (insn_in : std_ulogic_vector) return std_ulogic_vector;
+    function insn_dq (insn_in : std_ulogic_vector) return std_ulogic_vector;
    function insn_dx (insn_in : std_ulogic_vector) return std_ulogic_vector;
    function insn_to (insn_in : std_ulogic_vector) return std_ulogic_vector;
    function insn_bc (insn_in : std_ulogic_vector) return std_ulogic_vector;
@ -190,6 +191,11 @@ package body insn_helpers is
        return insn_in(15 downto 2);
    end;

+    function insn_dq (insn_in : std_ulogic_vector) return std_ulogic_vector is
+    begin
+        return insn_in(15 downto 4);
+    end;
+
    function insn_dx (insn_in : std_ulogic_vector) return std_ulogic_vector is
    begin
        return insn_in(15 downto 6) & insn_in(20 downto 16) & insn_in(0);
--- a/jtag_tap/tap_top.v
+++ b/jtag_tap/tap_top.v
@ -1,636 +0,0 @@
-//////////////////////////////////////////////////////////////////////
-////                                                              ////
-////  tap_top.v                                                   ////
-////                                                              ////
-////                                                              ////
-////  This file is part of the JTAG Test Access Port (TAP)        ////
-////  http://www.opencores.org/projects/jtag/                     ////
-////                                                              ////
-////  Author(s):                                                  ////
-////       Igor Mohor (igorm@opencores.org)                       ////
-////                                                              ////
-////                                                              ////
-////  All additional information is avaliable in the README.txt   ////
-////  file.                                                       ////
-////                                                              ////
-//////////////////////////////////////////////////////////////////////
-////                                                              ////
-//// Copyright (C) 2000 - 2003 Authors                            ////
-////                                                              ////
-//// This source file may be used and distributed without         ////
-//// restriction provided that this copyright statement is not    ////
-//// removed from the file and that any derivative work contains  ////
-//// the original copyright notice and the associated disclaimer. ////
-////                                                              ////
-//// This source file is free software; you can redistribute it   ////
-//// and/or modify it under the terms of the GNU Lesser General   ////
-//// Public License as published by the Free Software Foundation; ////
-//// either version 2.1 of the License, or (at your option) any   ////
-//// later version.                                               ////
-////                                                              ////
-//// This source is distributed in the hope that it will be       ////
-//// useful, but WITHOUT ANY WARRANTY; without even the implied   ////
-//// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR      ////
-//// PURPOSE.  See the GNU Lesser General Public License for more ////
-//// details.                                                     ////
-////                                                              ////
-//// You should have received a copy of the GNU Lesser General    ////
-//// Public License along with this source; if not, download it   ////
-//// from http://www.opencores.org/lgpl.shtml                     ////
-////                                                              ////
-//////////////////////////////////////////////////////////////////////
-//
-// CVS Revision History
-//
-// $Log: not supported by cvs2svn $
-// Revision 1.5  2004/01/18 09:27:39  simons
-// Blocking non blocking assignmenst fixed.
-//
-// Revision 1.4  2004/01/17 17:37:44  mohor
-// capture_dr_o added to ports.
-//
-// Revision 1.3  2004/01/14 13:50:56  mohor
-// 5 consecutive TMS=1 causes reset of TAP.
-//
-// Revision 1.2  2004/01/08 10:29:44  mohor
-// Control signals for tdo_pad_o mux are changed to negedge.
-//
-// Revision 1.1  2003/12/23 14:52:14  mohor
-// Directory structure changed. New version of TAP.
-//
-// Revision 1.10  2003/10/23 18:08:01  mohor
-// MBIST chain connection fixed.
-//
-// Revision 1.9  2003/10/23 16:17:02  mohor
-// CRC logic changed.
-//
-// Revision 1.8  2003/10/21 09:48:31  simons
-// Mbist support added.
-//
-// Revision 1.7  2002/11/06 14:30:10  mohor
-// Trst active high. Inverted on higher layer.
-//
-// Revision 1.6  2002/04/22 12:55:56  mohor
-// tdo_padoen_o changed to tdo_padoe_o. Signal is active high.
-//
-// Revision 1.5  2002/03/26 14:23:38  mohor
-// Signal tdo_padoe_o changed back to tdo_padoen_o.
-//
-// Revision 1.4  2002/03/25 13:16:15  mohor
-// tdo_padoen_o changed to tdo_padoe_o. Signal was always active high, just
-// not named correctly.
-//
-// Revision 1.3  2002/03/12 14:30:05  mohor
-// Few outputs for boundary scan chain added.
-//
-// Revision 1.2  2002/03/12 10:31:53  mohor
-// tap_top and dbg_top modules are put into two separate modules. tap_top
-// contains only tap state machine and related logic. dbg_top contains all
-// logic necessery for debugging.
-//
-// Revision 1.1  2002/03/08 15:28:16  mohor
-// Structure changed. Hooks for jtag chain added.
-//
-//
-//
-//
-
-// Top module
-module tap_top #(parameter
-                IDCODE_VALUE = 32'h14d57049,
-                IR_LENGTH    = 6)
-               (
-                // JTAG pads
-                tms_pad_i, 
-                tck_pad_i, 
-                trst_pad_i, 
-                tdi_pad_i, 
-                tdo_pad_o, 
-                tdo_padoe_o,
-
-                // TAP states
-                shift_dr_o,
-                pause_dr_o, 
-                update_dr_o,
-                capture_dr_o,
-                
-                // Select signals for boundary scan or mbist
-                extest_select_o, 
-                sample_preload_select_o,
-                mbist_select_o,
-                debug_select_o,
-                
-                // TDO signal that is connected to TDI of sub-modules.
-                tdo_o, 
-                
-                // TDI signals from sub-modules
-                debug_tdi_i,    // from debug module
-                bs_chain_tdi_i, // from Boundary Scan Chain
-                mbist_tdi_i     // from Mbist Chain
-              );
-
-
-// JTAG pins
-input   tms_pad_i;      // JTAG test mode select pad
-input   tck_pad_i;      // JTAG test clock pad
-input   trst_pad_i;     // JTAG test reset pad
-input   tdi_pad_i;      // JTAG test data input pad
-output  tdo_pad_o;      // JTAG test data output pad
-output  tdo_padoe_o;    // Output enable for JTAG test data output pad 
-
-// TAP states
-output  shift_dr_o;
-output  pause_dr_o;
-output  update_dr_o;
-output  capture_dr_o;
-
-// Select signals for boundary scan or mbist
-output  extest_select_o;
-output  sample_preload_select_o;
-output  mbist_select_o;
-output  debug_select_o;
-
-// TDO signal that is connected to TDI of sub-modules.
-output  tdo_o;
-
-// TDI signals from sub-modules
-input   debug_tdi_i;    // from debug module
-input   bs_chain_tdi_i; // from Boundary Scan Chain
-input   mbist_tdi_i;    // from Mbist Chain
-
-//Internal constants
-localparam EXTEST         = 6'b000000;
-localparam SAMPLE_PRELOAD = 6'b000001;
-localparam IDCODE         = 6'b001001;
-localparam DEBUG          = 6'b000011;
-localparam MBIST          = 6'b001010;
-localparam BYPASS         = 6'b111111;
-
-// Registers
-reg     test_logic_reset;
-reg     run_test_idle;
-reg     select_dr_scan;
-reg     capture_dr;
-reg     shift_dr;
-reg     exit1_dr;
-reg     pause_dr;
-reg     exit2_dr;
-reg     update_dr;
-reg     select_ir_scan;
-reg     capture_ir;
-reg     shift_ir, shift_ir_neg;
-reg     exit1_ir;
-reg     pause_ir;
-reg     exit2_ir;
-reg     update_ir;
-reg     extest_select;
-reg     sample_preload_select;
-reg     idcode_select;
-reg     mbist_select;
-reg     debug_select;
-reg     bypass_select;
-reg     tdo_pad_o;
-reg     tdo_padoe_o;
-reg     tms_q1, tms_q2, tms_q3, tms_q4;
-wire    tms_reset;
-
-assign tdo_o = tdi_pad_i;
-assign shift_dr_o = shift_dr;
-assign pause_dr_o = pause_dr;
-assign update_dr_o = update_dr;
-assign capture_dr_o = capture_dr;
-
-assign extest_select_o = extest_select;
-assign sample_preload_select_o = sample_preload_select;
-assign mbist_select_o = mbist_select;
-assign debug_select_o = debug_select;
-
-
-always @ (posedge tck_pad_i)
-begin
-  tms_q1 <= tms_pad_i;
-  tms_q2 <= tms_q1;
-  tms_q3 <= tms_q2;
-  tms_q4 <= tms_q3;
-end
-
-
-assign tms_reset = tms_q1 & tms_q2 & tms_q3 & tms_q4 & tms_pad_i;    // 5 consecutive TMS=1 causes reset
-
-
-/**********************************************************************************
-*                                                                                 *
-*   TAP State Machine: Fully JTAG compliant                                       *
-*                                                                                 *
-**********************************************************************************/
-
-// test_logic_reset state
-always @ (posedge tck_pad_i or posedge trst_pad_i)
-begin
-  if(trst_pad_i)
-    test_logic_reset<= 1'b1;
-  else if (tms_reset)
-    test_logic_reset<= 1'b1;
-  else
-    begin
-      if(tms_pad_i & (test_logic_reset | select_ir_scan))
-        test_logic_reset<= 1'b1;
-      else
-        test_logic_reset<= 1'b0;
-    end
-end
-
-// run_test_idle state
-always @ (posedge tck_pad_i or posedge trst_pad_i)
-begin
-  if(trst_pad_i)
-    run_test_idle<= 1'b0;
-  else if (tms_reset)
-    run_test_idle<= 1'b0;
-  else
-  if(~tms_pad_i & (test_logic_reset | run_test_idle | update_dr | update_ir))
-    run_test_idle<= 1'b1;
-  else
-    run_test_idle<= 1'b0;
-end
-
-// select_dr_scan state
-always @ (posedge tck_pad_i or posedge trst_pad_i)
-begin
-  if(trst_pad_i)
-    select_dr_scan<= 1'b0;
-  else if (tms_reset)
-    select_dr_scan<= 1'b0;
-  else
-  if(tms_pad_i & (run_test_idle | update_dr | update_ir))
-    select_dr_scan<= 1'b1;
-  else
-    select_dr_scan<= 1'b0;
-end
-
-// capture_dr state
-always @ (posedge tck_pad_i or posedge trst_pad_i)
-begin
-  if(trst_pad_i)
-    capture_dr<= 1'b0;
-  else if (tms_reset)
-    capture_dr<= 1'b0;
-  else
-  if(~tms_pad_i & select_dr_scan)
-    capture_dr<= 1'b1;
-  else
-    capture_dr<= 1'b0;
-end
-
-// shift_dr state
-always @ (posedge tck_pad_i or posedge trst_pad_i)
-begin
-  if(trst_pad_i)
-    shift_dr<= 1'b0;
-  else if (tms_reset)
-    shift_dr<= 1'b0;
-  else
-  if(~tms_pad_i & (capture_dr | shift_dr | exit2_dr))
-    shift_dr<= 1'b1;
-  else
-    shift_dr<= 1'b0;
-end
-
-// exit1_dr state
-always @ (posedge tck_pad_i or posedge trst_pad_i)
-begin
-  if(trst_pad_i)
-    exit1_dr<= 1'b0;
-  else if (tms_reset)
-    exit1_dr<= 1'b0;
-  else
-  if(tms_pad_i & (capture_dr | shift_dr))
-    exit1_dr<= 1'b1;
-  else
-    exit1_dr<= 1'b0;
-end
-
-// pause_dr state
-always @ (posedge tck_pad_i or posedge trst_pad_i)
-begin
-  if(trst_pad_i)
-    pause_dr<= 1'b0;
-  else if (tms_reset)
-    pause_dr<= 1'b0;
-  else
-  if(~tms_pad_i & (exit1_dr | pause_dr))
-    pause_dr<= 1'b1;
-  else
-    pause_dr<= 1'b0;
-end
-
-// exit2_dr state
-always @ (posedge tck_pad_i or posedge trst_pad_i)
-begin
-  if(trst_pad_i)
-    exit2_dr<= 1'b0;
-  else if (tms_reset)
-    exit2_dr<= 1'b0;
-  else
-  if(tms_pad_i & pause_dr)
-    exit2_dr<= 1'b1;
-  else
-    exit2_dr<= 1'b0;
-end
-
-// update_dr state
-always @ (posedge tck_pad_i or posedge trst_pad_i)
-begin
-  if(trst_pad_i)
-    update_dr<= 1'b0;
-  else if (tms_reset)
-    update_dr<= 1'b0;
-  else
-  if(tms_pad_i & (exit1_dr | exit2_dr))
-    update_dr<= 1'b1;
-  else
-    update_dr<= 1'b0;
-end
-
-// select_ir_scan state
-always @ (posedge tck_pad_i or posedge trst_pad_i)
-begin
-  if(trst_pad_i)
-    select_ir_scan<= 1'b0;
-  else if (tms_reset)
-    select_ir_scan<= 1'b0;
-  else
-  if(tms_pad_i & select_dr_scan)
-    select_ir_scan<= 1'b1;
-  else
-    select_ir_scan<= 1'b0;
-end
-
-// capture_ir state
-always @ (posedge tck_pad_i or posedge trst_pad_i)
-begin
-  if(trst_pad_i)
-    capture_ir<= 1'b0;
-  else if (tms_reset)
-    capture_ir<= 1'b0;
-  else
-  if(~tms_pad_i & select_ir_scan)
-    capture_ir<= 1'b1;
-  else
-    capture_ir<= 1'b0;
-end
-
-// shift_ir state
-always @ (posedge tck_pad_i or posedge trst_pad_i)
-begin
-  if(trst_pad_i)
-    shift_ir<= 1'b0;
-  else if (tms_reset)
-    shift_ir<= 1'b0;
-  else
-  if(~tms_pad_i & (capture_ir | shift_ir | exit2_ir))
-    shift_ir<= 1'b1;
-  else
-    shift_ir<= 1'b0;
-end
-
-// exit1_ir state
-always @ (posedge tck_pad_i or posedge trst_pad_i)
-begin
-  if(trst_pad_i)
-    exit1_ir<= 1'b0;
-  else if (tms_reset)
-    exit1_ir<= 1'b0;
-  else
-  if(tms_pad_i & (capture_ir | shift_ir))
-    exit1_ir<= 1'b1;
-  else
-    exit1_ir<= 1'b0;
-end
-
-// pause_ir state
-always @ (posedge tck_pad_i or posedge trst_pad_i)
-begin
-  if(trst_pad_i)
-    pause_ir<= 1'b0;
-  else if (tms_reset)
-    pause_ir<= 1'b0;
-  else
-  if(~tms_pad_i & (exit1_ir | pause_ir))
-    pause_ir<= 1'b1;
-  else
-    pause_ir<= 1'b0;
-end
-
-// exit2_ir state
-always @ (posedge tck_pad_i or posedge trst_pad_i)
-begin
-  if(trst_pad_i)
-    exit2_ir<= 1'b0;
-  else if (tms_reset)
-    exit2_ir<= 1'b0;
-  else
-  if(tms_pad_i & pause_ir)
-    exit2_ir<= 1'b1;
-  else
-    exit2_ir<= 1'b0;
-end
-
-// update_ir state
-always @ (posedge tck_pad_i or posedge trst_pad_i)
-begin
-  if(trst_pad_i)
-    update_ir<= 1'b0;
-  else if (tms_reset)
-    update_ir<= 1'b0;
-  else
-  if(tms_pad_i & (exit1_ir | exit2_ir))
-    update_ir<= 1'b1;
-  else
-    update_ir<= 1'b0;
-end
-
-/**********************************************************************************
-*                                                                                 *
-*   End: TAP State Machine                                                        *
-*                                                                                 *
-**********************************************************************************/
-
-
-
-/**********************************************************************************
-*                                                                                 *
-*   jtag_ir:  JTAG Instruction Register                                           *
-*                                                                                 *
-**********************************************************************************/
-reg [IR_LENGTH-1:0]  jtag_ir;          // Instruction register
-reg [IR_LENGTH-1:0]  latched_jtag_ir, latched_jtag_ir_neg;
-reg                   instruction_tdo;
-
-always @ (posedge tck_pad_i or posedge trst_pad_i)
-begin
-  if(trst_pad_i)
-    jtag_ir[IR_LENGTH-1:0] <= {IR_LENGTH{1'b0}};
-  else if(capture_ir)
-    jtag_ir <= 6'b000101;          // This value is fixed for easier fault detection
-  else if(shift_ir)
-    jtag_ir[IR_LENGTH-1:0] <= {tdi_pad_i, jtag_ir[IR_LENGTH-1:1]};
-end
-
-always @ (negedge tck_pad_i)
-begin
-  instruction_tdo <= jtag_ir[0];
-end
-/**********************************************************************************
-*                                                                                 *
-*   End: jtag_ir                                                                  *
-*                                                                                 *
-**********************************************************************************/
-
-
-
-/**********************************************************************************
-*                                                                                 *
-*   idcode logic                                                                  *
-*                                                                                 *
-**********************************************************************************/
-reg [31:0] idcode_reg;
-reg        idcode_tdo;
-
-always @ (posedge tck_pad_i)
-begin
-  if(idcode_select & shift_dr)
-    idcode_reg <= {tdi_pad_i, idcode_reg[31:1]};
-  else
-    idcode_reg <= IDCODE_VALUE;
-end
-
-always @ (negedge tck_pad_i)
-begin
-    idcode_tdo <= idcode_reg[0];
-end
-/**********************************************************************************
-*                                                                                 *
-*   End: idcode logic                                                             *
-*                                                                                 *
-**********************************************************************************/
-
-
-/**********************************************************************************
-*                                                                                 *
-*   Bypass logic                                                                  *
-*                                                                                 *
-**********************************************************************************/
-reg  bypassed_tdo;
-reg  bypass_reg;
-
-always @ (posedge tck_pad_i or posedge trst_pad_i)
-begin
-  if (trst_pad_i)
-    bypass_reg<= 1'b0;
-  else if(shift_dr)
-    bypass_reg<= tdi_pad_i;
-end
-
-always @ (negedge tck_pad_i)
-begin
-  bypassed_tdo <= bypass_reg;
-end
-/**********************************************************************************
-*                                                                                 *
-*   End: Bypass logic                                                             *
-*                                                                                 *
-**********************************************************************************/
-
-
-/**********************************************************************************
-*                                                                                 *
-*   Activating Instructions                                                       *
-*                                                                                 *
-**********************************************************************************/
-// Updating jtag_ir (Instruction Register)
-always @ (posedge tck_pad_i or posedge trst_pad_i)
-begin
-  if(trst_pad_i)
-    latched_jtag_ir <= IDCODE;   // IDCODE selected after reset
-  else if (tms_reset)
-    latched_jtag_ir <= IDCODE;   // IDCODE selected after reset
-  else if(update_ir)
-    latched_jtag_ir <= jtag_ir;
-end
-
-/**********************************************************************************
-*                                                                                 *
-*   End: Activating Instructions                                                  *
-*                                                                                 *
-**********************************************************************************/
-
-
-// Updating jtag_ir (Instruction Register)
-always @ (latched_jtag_ir)
-begin
-  extest_select           = 1'b0;
-  sample_preload_select   = 1'b0;
-  idcode_select           = 1'b0;
-  mbist_select            = 1'b0;
-  debug_select            = 1'b0;
-  bypass_select           = 1'b0;
-
-  case(latched_jtag_ir)    /* synthesis parallel_case */ 
-    EXTEST:            extest_select           = 1'b1;    // External test
-    SAMPLE_PRELOAD:    sample_preload_select   = 1'b1;    // Sample preload
-    IDCODE:            idcode_select           = 1'b1;    // ID Code
-    MBIST:             mbist_select            = 1'b1;    // Mbist test
-    DEBUG:             debug_select            = 1'b1;    // Debug
-    BYPASS:            bypass_select           = 1'b1;    // BYPASS
-    default:            bypass_select           = 1'b1;    // BYPASS
-  endcase
-end
-
-
-
-/**********************************************************************************
-*                                                                                 *
-*   Multiplexing TDO data                                                         *
-*                                                                                 *
-**********************************************************************************/
-always @ (shift_ir_neg or exit1_ir or instruction_tdo or latched_jtag_ir_neg or idcode_tdo or
-          debug_tdi_i or bs_chain_tdi_i or mbist_tdi_i or 
-          bypassed_tdo)
-begin
-  if(shift_ir_neg)
-    tdo_pad_o = instruction_tdo;
-  else
-    begin
-      case(latched_jtag_ir_neg)    // synthesis parallel_case
-        IDCODE:            tdo_pad_o = idcode_tdo;       // Reading ID code
-        DEBUG:             tdo_pad_o = debug_tdi_i;      // Debug
-        SAMPLE_PRELOAD:    tdo_pad_o = bs_chain_tdi_i;   // Sampling/Preloading
-        EXTEST:            tdo_pad_o = bs_chain_tdi_i;   // External test
-        MBIST:             tdo_pad_o = mbist_tdi_i;      // Mbist test
-        default:            tdo_pad_o = bypassed_tdo;     // BYPASS instruction
-      endcase
-    end
-end
-
-
-// Tristate control for tdo_pad_o pin
-always @ (negedge tck_pad_i)
-begin
-  tdo_padoe_o <= shift_ir | shift_dr | (pause_dr & debug_select);
-end
-/**********************************************************************************
-*                                                                                 *
-*   End: Multiplexing TDO data                                                    *
-*                                                                                 *
-**********************************************************************************/
-
-
-always @ (negedge tck_pad_i)
-begin
-  shift_ir_neg <= shift_ir;
-  latched_jtag_ir_neg <= latched_jtag_ir;
-end
-
-
-endmodule
--- a/litedram/extras/litedram-wrapper-l2.vhdl
+++ b/litedram/extras/litedram-wrapper-l2.vhdl
@ -13,6 +13,7 @@ entity litedram_wrapper is
 	DRAM_ABITS      : positive;
 	DRAM_ALINES     : natural;
 	DRAM_DLINES     : natural;
+	DRAM_CKLINES    : natural;
 	DRAM_PORT_WIDTH : positive;

        -- Pseudo-ROM payload
@ -69,8 +70,8 @@ entity litedram_wrapper is
        ddram_dq      : inout std_ulogic_vector(DRAM_DLINES-1 downto 0);
        ddram_dqs_p   : inout std_ulogic_vector(DRAM_DLINES/8-1 downto 0);
        ddram_dqs_n   : inout std_ulogic_vector(DRAM_DLINES/8-1 downto 0);
-        ddram_clk_p   : out std_ulogic;
-        ddram_clk_n   : out std_ulogic;
+        ddram_clk_p   : out std_ulogic_vector(DRAM_CKLINES-1 downto 0);
+        ddram_clk_n   : out std_ulogic_vector(DRAM_CKLINES-1 downto 0);
        ddram_cke     : out std_ulogic;
        ddram_odt     : out std_ulogic;
        ddram_reset_n : out std_ulogic
@ -93,8 +94,8 @@ architecture behaviour of litedram_wrapper is
        ddram_dq                       : inout std_ulogic_vector(DRAM_DLINES-1 downto 0);
        ddram_dqs_p                    : inout std_ulogic_vector(DRAM_DLINES/8-1 downto 0);
        ddram_dqs_n                    : inout std_ulogic_vector(DRAM_DLINES/8-1 downto 0);
-        ddram_clk_p                    : out std_ulogic;
-        ddram_clk_n                    : out std_ulogic;
+        ddram_clk_p                    : out std_ulogic_vector(DRAM_CKLINES-1 downto 0);
+        ddram_clk_n                    : out std_ulogic_vector(DRAM_CKLINES-1 downto 0);
        ddram_cke                      : out std_ulogic;
        ddram_odt                      : out std_ulogic;
        ddram_reset_n                  : out std_ulogic;
@ -163,7 +164,6 @@ architecture behaviour of litedram_wrapper is
    -- Select a WB word inside DRAM port width
    constant WB_WORD_COUNT              : positive := DRAM_DBITS/WBL;
    constant WB_WSEL_BITS               : positive := log2(WB_WORD_COUNT);
-    constant WB_WSEL_RIGHT              : positive := log2(WBL/8);

    -- BRAM organisation: We never access more than wishbone_data_bits at
    -- a time so to save resources we make the array only that wide, and
@ -312,10 +312,20 @@ architecture behaviour of litedram_wrapper is
    -- Helper functions to decode incoming requests
    --

+    -- Return the DRAM real address from a wishbone address.
+    --
+    -- The low (REAL_ADDR_BITS - wishbone_log2_width) bits of addr are
+    -- copied into the upper bits of a REAL_ADDR_BITS-wide result and the
+    -- bottom wishbone_log2_width bits are left at zero, i.e. addr is
+    -- shifted left by wishbone_log2_width and truncated to REAL_ADDR_BITS.
+    -- NOTE(review): presumably addr counts wishbone data words and the
+    -- result is a byte address; confirm against wishbone_types.
+    function get_real_addr(addr: wishbone_addr_type) return std_ulogic_vector is
+        -- Start from all zeroes so the bits below wishbone_log2_width stay cleared
+        variable ra: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0) := (others => '0');
+    begin
+        ra(REAL_ADDR_BITS - 1 downto wishbone_log2_width) :=
+            addr(REAL_ADDR_BITS - wishbone_log2_width - 1 downto 0);
+        return ra;
+    end;
+
    -- Return the cache line index (tag index) for an address
    function get_index(addr: wishbone_addr_type) return index_t is
    begin
-        return to_integer(unsigned(addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS)));
+        return to_integer(unsigned(addr(SET_SIZE_BITS - wishbone_log2_width - 1 downto
+                                        LINE_OFF_BITS - wishbone_log2_width)));
    end;

    -- Return the cache row index (data memory) for an address
@ -378,7 +388,8 @@ architecture behaviour of litedram_wrapper is
    -- Get the tag value from the address
    function get_tag(addr: wishbone_addr_type) return cache_tag_t is
    begin
-        return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS);
+        return addr(REAL_ADDR_BITS - wishbone_log2_width - 1 downto
+                    SET_SIZE_BITS - wishbone_log2_width);
    end;

    -- Read a tag from a tag memory row
@ -447,7 +458,7 @@ begin
                wb_ctrl_stb <= '0';
            else
                -- XXX Maybe only update addr when cyc = '1' to save power ?
-                wb_ctrl_adr   <= x"0000" & wb_ctrl_in.adr(15 downto 2);
+                wb_ctrl_adr   <= x"0000" & wb_ctrl_in.adr(13 downto 0);
                wb_ctrl_dat_w <= wb_ctrl_in.dat;
                wb_ctrl_sel   <= wb_ctrl_in.sel;
                wb_ctrl_we    <= wb_ctrl_in.we;
@ -608,7 +619,7 @@ begin
            if stall = '1' and wb_out.stall = '0' and wb_in.cyc = '1' and wb_in.stb = '1' then
                 wb_stash <= wb_in;
                 if TRACE then
-                     report "stashed wb req ! addr:" & to_hstring(wb_in.adr) &
+                     report "stashed wb req ! addr:" & to_hstring(wb_in.adr & "000") &
                         " we:" & std_ulogic'image(wb_in.we) &
                         " sel:" & to_hstring(wb_in.sel);
                 end if;
@ -621,7 +632,7 @@ begin
                    wb_req <= wb_stash;
                    wb_stash.cyc <= '0';
                    if TRACE then
-                        report "unstashed wb req ! addr:" & to_hstring(wb_stash.adr) &
+                        report "unstashed wb req ! addr:" & to_hstring(wb_stash.adr & "000") &
                            " we:" & std_ulogic'image(wb_stash.we) &
                            " sel:" & to_hstring(wb_stash.sel);
                    end if;
@ -636,7 +647,7 @@ begin

                    if TRACE then
                        if wb_in.cyc = '1' and wb_in.stb = '1' then
-                            report "latch new wb req ! addr:" & to_hstring(wb_in.adr) &
+                            report "latch new wb req ! addr:" & to_hstring(wb_in.adr & "000") &
                                " we:" & std_ulogic'image(wb_in.we) &
                                " sel:" & to_hstring(wb_in.sel);
                        end if;
@ -665,12 +676,12 @@ begin

            if TRACE then
                if req_op = OP_LOAD_HIT then
-                    report "Load hit addr:" & to_hstring(wb_req.adr) &
+                    report "Load hit addr:" & to_hstring(wb_req.adr & "000") &
                        " idx:" & integer'image(req_index) &
                        " tag:" & to_hstring(req_tag) &
                        " way:" & integer'image(req_hit_way);
                elsif req_op = OP_LOAD_MISS then
-                    report "Load miss addr:" & to_hstring(wb_req.adr);
+                    report "Load miss addr:" & to_hstring(wb_req.adr & "000");
                end if;
                if read_ack_0 = '1' then
                    report "read data:" & to_hstring(cache_out(read_way_0));
@ -771,20 +782,19 @@ begin
    begin
        -- Extract line, row and tag from request
        req_index <= get_index(wb_req.adr);
-        req_row <= get_row(wb_req.adr(REAL_ADDR_BITS-1 downto 0));
+        req_row <= get_row(get_real_addr(wb_req.adr));
        req_tag <= get_tag(wb_req.adr);

        -- Calculate address of beginning of cache row, will be
        -- used for cache miss processing if needed
-        req_laddr <= wb_req.adr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) &
-                     (ROW_OFF_BITS-1 downto 0 => '0');
+        req_laddr <= get_real_addr(wb_req.adr);


        -- Do we have a valid request in the WB latch ?
        valid := wb_req.cyc = '1' and wb_req.stb = '1';

        -- Store signals (hard wired for 64-bit wishbone at the moment)
-        req_wsl <= wb_req.adr(WB_WSEL_RIGHT+WB_WSEL_BITS-1 downto WB_WSEL_RIGHT);
+        req_wsl <= wb_req.adr(WB_WSEL_BITS-1 downto 0);
        for i in 0 to WB_WORD_COUNT-1 loop
            if to_integer(unsigned(req_wsl)) = i then
                req_we(WBSL*(i+1)-1 downto WBSL*i) <= wb_req.sel;
@ -892,7 +902,7 @@ begin
        variable stq_wsl  : std_ulogic_vector(WB_WSEL_BITS-1 downto 0);
    begin
        storeq_wr_data <= wb_req.dat & wb_req.sel &
-                          wb_req.adr(WB_WSEL_RIGHT+WB_WSEL_BITS-1 downto WB_WSEL_RIGHT);
+                          wb_req.adr(WB_WSEL_BITS-1 downto 0);

        -- Only queue stores if we can also send a command
        if req_op = OP_STORE_HIT or req_op = OP_STORE_MISS then
@ -927,13 +937,13 @@ begin
            if rising_edge(system_clk) then
                if req_op = OP_STORE_HIT then
                    report "Store hit to:" &
-                        to_hstring(wb_req.adr(DRAM_ABITS+3 downto 0)) &
+                        to_hstring(wb_req.adr(DRAM_ABITS downto 0) & "000") &
                        " data:" & to_hstring(req_wdata) &
                        " we:" & to_hstring(req_we) &
                        " V:" & std_ulogic'image(user_port0_cmd_ready);
                else
                    report "Store miss to:" &
-                        to_hstring(wb_req.adr(DRAM_ABITS+3 downto 0)) &
+                        to_hstring(wb_req.adr(DRAM_ABITS downto 0) & "000") &
                        " data:" & to_hstring(req_wdata) &
                        " we:" & to_hstring(req_we) &
                        " V:" & std_ulogic'image(user_port0_cmd_ready);
@ -954,7 +964,8 @@ begin
        if req_op = OP_STORE_HIT or req_op = OP_STORE_MISS then
            -- For stores, forward signals directly. Only send command if
            -- the FIFO can accept a store.
-            user_port0_cmd_addr  <= wb_req.adr(DRAM_ABITS+ROW_OFF_BITS-1 downto ROW_OFF_BITS);
+            user_port0_cmd_addr  <= wb_req.adr(DRAM_ABITS + ROW_OFF_BITS - wishbone_log2_width - 1 downto
+                                               ROW_OFF_BITS - wishbone_log2_width);
            user_port0_cmd_we    <= '1';
            user_port0_cmd_valid <= storeq_wr_ready;
        else
--- a/litedram/extras/sim_litedram.vhdl
+++ b/litedram/extras/sim_litedram.vhdl
@ -102,8 +102,8 @@ entity litedram_core is
 	ddram_dq                       : inout std_ulogic_vector(15 downto 0);
 	ddram_dqs_p                    : inout std_ulogic_vector(1 downto 0);
 	ddram_dqs_n                    : inout std_ulogic_vector(1 downto 0);
-	ddram_clk_p                    : out std_ulogic;
-	ddram_clk_n                    : out std_ulogic;
+	ddram_clk_p                    : out std_ulogic_vector(0 downto 0);
+	ddram_clk_n                    : out std_ulogic_vector(0 downto 0);
 	ddram_cke                      : out std_ulogic;
 	ddram_odt                      : out std_ulogic;
 	ddram_reset_n                  : out std_ulogic;
--- a/litedram/gen-src/acorn-cle-215.yml
+++ b/litedram/gen-src/acorn-cle-215.yml
@ -3,13 +3,11 @@

 {
    # General ------------------------------------------------------------------
-    "cpu":        "None",  # Type of CPU used for init/calib (vexriscv, lm32)
-    "cpu_variant":"standard",
+    "cpu":        "None",  # CPU type (ex vexriscv, serv, None)
    "speedgrade": -2,          # FPGA speedgrade
    "memtype":    "DDR3",      # DRAM type

    # PHY ----------------------------------------------------------------------
-    "cmd_delay":       0,             # Command additional delay (in taps)
    "cmd_latency":     0,             # Command additional latency
    "sdram_module":    "MT41K512M16", # SDRAM modules of the board or SO-DIMM
    "sdram_module_nb": 2,             # Number of byte groups
@ -33,10 +31,7 @@
    "user_ports": {
        "native_0": {
            "type": "native",
+            "block_until_ready": False,
        },
    },
-
-    # CSR Port -----------------------------------------------------------------
-    "csr_alignment"  : 32,
-    "csr_data_width" : 32,
 }
--- a/litedram/gen-src/arty.yml
+++ b/litedram/gen-src/arty.yml
@ -3,13 +3,11 @@

 {
    # General ------------------------------------------------------------------
-    "cpu":        "None",  # Type of CPU used for init/calib (vexriscv, lm32)
-    "cpu_variant":"standard",
+    "cpu":        "None",  # CPU type (ex vexriscv, serv, None)
    "speedgrade": -1,          # FPGA speedgrade
    "memtype":    "DDR3",      # DRAM type

    # PHY ----------------------------------------------------------------------
-    "cmd_delay":       0,             # Command additional delay (in taps)
    "cmd_latency":     0,             # Command additional latency
    "sdram_module":    "MT41K128M16", # SDRAM modules of the board or SO-DIMM
    "sdram_module_nb": 2,             # Number of byte groups
@ -33,10 +31,7 @@
    "user_ports": {
        "native_0": {
            "type": "native",
+            "block_until_ready": False,
        },
    },
-
-    # CSR Port -----------------------------------------------------------------
-    "csr_alignment"  : 32,
-    "csr_data_width" : 32,
 }
--- a/litedram/gen-src/dram-init-mem.vhdl
+++ b/litedram/gen-src/dram-init-mem.vhdl
@ -100,7 +100,7 @@ begin
        if rising_edge(clk) then
            oack <= '0';
            if (wb_in.cyc and wb_in.stb) = '1' then
-                adr := to_integer((unsigned(wb_in.adr(INIT_RAM_ABITS-1 downto 2))));
+                adr := to_integer((unsigned(wb_in.adr(INIT_RAM_ABITS - 3 downto 0))));
                if wb_in.we = '0' then
                   obuf <= init_ram(adr);
                else
--- a/litedram/gen-src/generate.py
+++ b/litedram/gen-src/generate.py
@ -1,17 +1,10 @@
 #!/usr/bin/python3

-from fusesoc.capi2.generator import Generator
 from litex.build.tools import write_to_file
 from litex.build.tools import replace_in_file
-from litex.build.generic_platform import *
-from litex.build.xilinx import XilinxPlatform
-from litex.build.lattice import LatticePlatform
-from litex.soc.integration.builder import *
 from litedram.gen import *
 import subprocess
 import os
-import sys
-import yaml
 import shutil

 def make_new_dir(base, added):
@ -28,9 +21,6 @@ gen_src_dir = os.path.join(base_dir, "gen-src")
 gen_dir = make_new_dir(base_dir, "generated")

 # Build the init code for microwatt-initialized DRAM
-#
-# XXX Not working yet
-#
 def build_init_code(build_dir, is_sim):

    # More path fudging
@ -45,7 +35,7 @@ def build_init_code(build_dir, is_sim):
    print(" lx src dir:", lxbios_src_dir)

    # Generate mem.h (hard wire size, it's not important)
-    mem_h = "#define MAIN_RAM_BASE 0x40000000\n#define MAIN_RAM_SIZE 0x10000000"
+    mem_h = "#define MAIN_RAM_BASE 0x40000000UL\n#define MAIN_RAM_SIZE 0x10000000UL\n"
    write_to_file(os.path.join(gen_inc_dir, "mem.h"), mem_h)

    # Environment
@ -55,18 +45,19 @@ def build_init_code(build_dir, is_sim):
    def add_var(k, v):
        env_vars.append("{}={}\n".format(k, _makefile_escape(v)))

-    add_var("BUILD_DIR", sw_dir)
-    add_var("SRC_DIR", src_dir)
-    add_var("GENINC_DIR", sw_inc_dir)
-    add_var("LXSRC_DIR", lxbios_src_dir)
+    makefile = os.path.join(src_dir, "Makefile")
+    cmd = ["make", "-C", build_dir, "-f", makefile]
+    cmd.append("BUILD_DIR=%s" % sw_dir)
+    cmd.append("SRC_DIR=%s" % src_dir)
+    cmd.append("GENINC_DIR=%s" % sw_inc_dir)
+    cmd.append("LXSRC_DIR=%s" % lxbios_src_dir)
+
    if is_sim:
-        add_var("EXTRA_CFLAGS", "-D__SIM__")
-    write_to_file(os.path.join(gen_inc_dir, "variables.mak"), "".join(env_vars))
+        cmd.append("EXTRA_CFLAGS=%s" % "-D__SIM__")

    # Build init code
    print(" Generating init software...")
-    makefile = os.path.join(src_dir, "Makefile")
-    r = subprocess.check_call(["make", "-C", build_dir, "-I", gen_inc_dir, "-f", makefile])
+    r = subprocess.check_call(cmd)
    print("Make result:", r)

    return os.path.join(sw_dir, "obj", "sdram_init.hex")
@ -76,48 +67,17 @@ def generate_one(t):
    print("Generating target:", t)

    # Is it a simulation ?
-    is_sim = t is "sim"
+    is_sim = "sim" in t

    # Muck with directory path
    build_dir = make_new_dir(build_top_dir, t)
    t_dir = make_new_dir(gen_dir, t)

-    # Grab config file
-    cfile = os.path.join(gen_src_dir, t  + ".yml")
-    core_config = yaml.load(open(cfile).read(), Loader=yaml.Loader)
-
-    ### TODO: Make most stuff below a function in litedram gen.py and
-    ###       call it rather than duplicate it
-    ###
-
-    # Convert YAML elements to Python/LiteX
-    for k, v in core_config.items():
-        replaces = {"False": False, "True": True, "None": None}
-        for r in replaces.keys():
-            if v == r:
-                core_config[k] = replaces[r]
-        if "clk_freq" in k:
-            core_config[k] = float(core_config[k])
-        if k == "sdram_module":
-            core_config[k] = getattr(litedram_modules, core_config[k])
-        if k == "sdram_phy":
-            core_config[k] = getattr(litedram_phys, core_config[k])
-
-    # Generate core
+    cmd = ["litedram_gen", "--output-dir=%s" % build_dir]
    if is_sim:
-        platform = SimPlatform("", io=[])
-    elif core_config["sdram_phy"] in [litedram_phys.ECP5DDRPHY]:
-        platform = LatticePlatform("LFE5UM5G-45F-8BG381C", io=[], toolchain="trellis")
-    elif core_config["sdram_phy"] in [litedram_phys.A7DDRPHY, litedram_phys.K7DDRPHY, litedram_phys.V7DDRPHY]:
-        platform = XilinxPlatform("", io=[], toolchain="vivado")
-    else:
-        raise ValueError("Unsupported SDRAM PHY: {}".format(core_config["sdram_phy"]))
-
-    soc      = LiteDRAMCore(platform, core_config, is_sim = is_sim, integrated_rom_size=0x6000)
-
-    # Build into build_dir
-    builder  = Builder(soc, output_dir=build_dir, compile_gateware=False)
-    vns      = builder.build(build_name="litedram_core", regular_comb=False)
+        cmd.append("--sim")
+    cmd.append("%s.yml" % t)
+    subprocess.check_call(cmd)

    # Grab generated gatewar dir
    gw_dir = os.path.join(build_dir, "gateware")
@ -140,7 +100,7 @@ def generate_one(t):

 def main():

-    targets = ['arty','nexys-video', 'genesys2', 'acorn-cle-215', 'sim']
+    targets = ['arty','nexys-video', 'genesys2', 'acorn-cle-215', 'wukong-v2', 'orangecrab-85-0.2', 'sim']
    for t in targets:
        generate_one(t)
    
--- a/litedram/gen-src/genesys2.yml
+++ b/litedram/gen-src/genesys2.yml
@ -3,8 +3,7 @@

 {
    # General ------------------------------------------------------------------
-    "cpu":        "None",  # Type of CPU used for init/calib (vexriscv, lm32)
-    "cpu_variant":"standard",
+    "cpu":        "None",  # CPU type (ex vexriscv, serv, None)
    "speedgrade": -2,          # FPGA speedgrade
    "memtype":    "DDR3",      # DRAM type

@ -13,12 +12,12 @@
    "sdram_module":    "MT41J256M16", # SDRAM modules of the board or SO-DIMM
    "sdram_module_nb": 4,             # Number of byte groups
    "sdram_rank_nb":   1,             # Number of ranks
-    "sdram_phy":       K7DDRPHY,      # Type of FPGA PHY
+    "sdram_phy":       "K7DDRPHY",    # Type of FPGA PHY

    # Electrical ---------------------------------------------------------------
-    "rtt_nom": "60ohm", # Nominal termination
-    "rtt_wr":  "60ohm", # Write termination
-    "ron":     "34ohm", # Output driver impedance
+    "rtt_nom": "60ohm",  # Nominal termination
+    "rtt_wr":  "60ohm",  # Write termination
+    "ron":     "34ohm",  # Output driver impedance

    # Frequency ----------------------------------------------------------------
    "input_clk_freq":   200e6, # Input clock frequency
@ -32,10 +31,7 @@
    "user_ports": {
        "native_0": {
            "type": "native",
+            "block_until_ready": False,
        },
    },
-
-    # CSR Port -----------------------------------------------------------------
-    "csr_alignment"  : 32,
-    "csr_data_width" : 32,
 }
--- a/litedram/gen-src/nexys-video.yml
+++ b/litedram/gen-src/nexys-video.yml
@ -3,13 +3,11 @@

 {
    # General ------------------------------------------------------------------
-    "cpu":        "None",  # Type of CPU used for init/calib (vexriscv, lm32)
-    "cpu_variant":"standard",
+    "cpu":        "None",  # CPU type (ex vexriscv, serv, None)
    "speedgrade": -1,          # FPGA speedgrade
    "memtype":    "DDR3",      # DRAM type

    # PHY ----------------------------------------------------------------------
-    "cmd_delay":       0,             # Command additional delay (in taps)
    "cmd_latency":     0,             # Command additional latency
    "sdram_module":    "MT41K256M16", # SDRAM modules of the board or SO-DIMM
    "sdram_module_nb": 2,             # Number of byte groups
@ -33,10 +31,7 @@
    "user_ports": {
        "native_0": {
            "type": "native",
+            "block_until_ready": False,
        },
    },
-
-    # CSR Port -----------------------------------------------------------------
-    "csr_alignment"  : 32,
-    "csr_data_width" : 32,
 }
--- a/litedram/gen-src/orangecrab-85-0.2.yml
+++ b/litedram/gen-src/orangecrab-85-0.2.yml
@ -0,0 +1,39 @@
+# Matt Johnston 2021
+# Based on parameters from Greg Davill's Orangecrab-test-sw
+
+{
+    "cpu":        "None",  # CPU type (ex vexriscv, serv, None)
+    "device": "LFE5U-85F-8MG285C",
+    "memtype":    "DDR3",      # DRAM type
+
+    "sdram_module":    "MT41K256M16", # SDRAM modules of the board or SO-DIMM
+    "sdram_module_nb": 2,             # Number of byte groups
+    "sdram_rank_nb":   1,             # Number of ranks
+    "sdram_phy":       "ECP5DDRPHY",    # Type of FPGA PHY
+
+    # Electrical ---------------------------------------------------------------
+    "rtt_nom": "disabled",  # Nominal termination. ("disabled" from orangecrab)
+    "rtt_wr":  "60ohm",  # Write termination. (Default)
+    "ron":     "34ohm",  # Output driver impedance. (Default)
+
+    # Frequency ----------------------------------------------------------------
+    "init_clk_freq":   24e6,
+    "input_clk_freq":   48e6, # Input clock frequency
+    "sys_clk_freq":     48e6, # System clock frequency (DDR_clk = 4 x sys_clk)
+
+    # Command additional delay (in taps): 0 if freq > 64e6, else 100.
+    # See https://github.com/enjoy-digital/litedram/issues/130
+    "cmd_delay": 100,
+
+    # Core ---------------------------------------------------------------------
+    "cmd_buffer_depth": 16,    # Depth of the command buffer
+
+    "dm_swap": true,
+
+    # User Ports ---------------------------------------------------------------
+    "user_ports": {
+        "native_0": {
+            "type": "native",
+            "block_until_ready": False,
+        },
+    },
+}
--- a/litedram/gen-src/sdram_init/Makefile
+++ b/litedram/gen-src/sdram_init/Makefile
@ -1,6 +1,5 @@
 #### Directories

-include variables.mak
 OBJ = $(BUILD_DIR)/obj

 LXINC_DIR=$(LXSRC_DIR)/include
@ -33,6 +32,7 @@ CPPFLAGS += -I$(LXSRC_DIR) -I$(LXINC_DIR) -I$(LXINC_DIR)/base -I$(LXSRC_DIR)/lib

 CPPFLAGS += -isystem $(shell $(CC) -print-file-name=include)
 CFLAGS = -Os -g -Wall -std=c99 -m64 -mabi=elfv2 -msoft-float -mno-string -mno-multiple -mno-vsx -mno-altivec -mlittle-endian -fno-stack-protector -mstrict-align -ffreestanding -fdata-sections -ffunction-sections -fno-delete-null-pointer-checks 
+CFLAGS += -Werror
 ASFLAGS = $(CPPFLAGS) $(CFLAGS)
 LDFLAGS = -static -nostdlib -T $(OBJ)/$(PROGRAM).lds --gc-sections

--- a/litedram/gen-src/sdram_init/main.c
+++ b/litedram/gen-src/sdram_init/main.c
@ -125,7 +125,7 @@ static bool check_flash(void)

 	/* Supported flash types for quad mode */
 	if (id[0] == 0x01 &&
-	    (id[1] == 0x02 || id[1] == 0x20) &&
+	    (id[1] == 0x02 || id[1] == 0x20 || id[1] == 0x60) &&
 	    (id[2] == 0x18 || id[2] == 0x19)) {
 		check_spansion_quad_mode();
 		quad = true;
@ -262,6 +262,8 @@ uint64_t main(void)
 		printf("SPIFLASH ");
 	if (ftr & SYS_REG_INFO_HAS_LITEETH)
 		printf("ETHERNET ");
+	if (ftr & SYS_REG_INFO_HAS_LITESDCARD)
+		printf("SDCARD ");
 	printf("\n");
 	if (ftr & SYS_REG_INFO_HAS_BRAM) {
 		val = readq(SYSCON_BASE + SYS_REG_BRAMINFO) & SYS_REG_BRAMINFO_SIZE_MASK;
@ -286,7 +288,7 @@ uint64_t main(void)
 	if (ftr & SYS_REG_INFO_HAS_DRAM) {
 		printf("LiteDRAM built from Migen %s and LiteX %s\n",
 		       MIGEN_GIT_SHA1, LITEX_GIT_SHA1);
-		sdrinit();
+		sdram_init();
 	}
 	if (ftr & SYS_REG_INFO_HAS_BRAM) {
 		printf("Booting from BRAM...\n");
--- a/litedram/gen-src/sim.yml
+++ b/litedram/gen-src/sim.yml
@ -3,14 +3,11 @@

 {
    # General ------------------------------------------------------------------
-    "cpu":        "None",  # Type of CPU used for init/calib (vexriscv, lm32)
-    "cpu_variant":"standard",
+    "cpu":        "None",  # CPU type (ex vexriscv, serv, None)
    "speedgrade": -1,          # FPGA speedgrade
    "memtype":    "DDR3",      # DRAM type
-    "sim" : "True",

    # PHY ----------------------------------------------------------------------
-    "cmd_delay":       0,             # Command additional delay (in taps)
    "cmd_latency":     0,             # Command additional latency
    "sdram_module":    "MT41K128M16", # SDRAM modules of the board or SO-DIMM
    "sdram_module_nb": 2,             # Number of byte groups
@ -34,10 +31,7 @@
    "user_ports": {
        "native_0": {
            "type": "native",
+            "block_until_ready": False,
        },
    },
-
-    # CSR Port -----------------------------------------------------------------
-    "csr_alignment"  : 32,
-    "csr_data_width" : 32,
 }
--- a/litedram/gen-src/wukong-v2.yml
+++ b/litedram/gen-src/wukong-v2.yml
@ -0,0 +1,37 @@
+# This file is Copyright (c) 2018-2019 Florent Kermarrec <florent@enjoy-digital.fr>
+# License: BSD
+
+{
+    # General ------------------------------------------------------------------
+    "cpu":        "None",  # CPU type (ex vexriscv, serv, None)
+    "speedgrade": -1,          # FPGA speedgrade
+    "memtype":    "DDR3",      # DRAM type
+
+    # PHY ----------------------------------------------------------------------
+    "cmd_latency":     0,             # Command additional latency
+    "sdram_module":    "MT41K128M16", # SDRAM modules of the board or SO-DIMM
+    "sdram_module_nb": 2,             # Number of byte groups
+    "sdram_rank_nb":   1,             # Number of ranks
+    "sdram_phy":       "A7DDRPHY",    # Type of FPGA PHY
+
+    # Electrical ---------------------------------------------------------------
+    "rtt_nom": "60ohm",  # Nominal termination
+    "rtt_wr":  "60ohm",  # Write termination
+    "ron":     "34ohm",  # Output driver impedance
+
+    # Frequency ----------------------------------------------------------------
+    "input_clk_freq":   50e6, # Input clock frequency
+    "sys_clk_freq":     100e6, # System clock frequency (DDR_clk = 4 x sys_clk)
+    "iodelay_clk_freq": 200e6, # IODELAYs reference clock frequency
+
+    # Core ---------------------------------------------------------------------
+    "cmd_buffer_depth": 16,    # Depth of the command buffer
+
+    # User Ports ---------------------------------------------------------------
+    "user_ports": {
+        "native_0": {
+            "type": "native",
+            "block_until_ready": False,
+        },
+    },
+}
--- a/litedram/generated/acorn-cle-215/litedram-initmem.vhdl
+++ b/litedram/generated/acorn-cle-215/litedram-initmem.vhdl
@ -100,7 +100,7 @@ begin
        if rising_edge(clk) then
            oack <= '0';
            if (wb_in.cyc and wb_in.stb) = '1' then
-                adr := to_integer((unsigned(wb_in.adr(INIT_RAM_ABITS-1 downto 2))));
+                adr := to_integer((unsigned(wb_in.adr(INIT_RAM_ABITS - 3 downto 0))));
                if wb_in.we = '0' then
                   obuf <= init_ram(adr);
                else
--- a/litedram/generated/acorn-cle-215/litedram_core.init
+++ b/litedram/generated/acorn-cle-215/litedram_core.init
--- a/litedram/generated/acorn-cle-215/litedram_core.v
+++ b/litedram/generated/acorn-cle-215/litedram_core.v
--- a/litedram/generated/arty/litedram-initmem.vhdl
+++ b/litedram/generated/arty/litedram-initmem.vhdl
@ -100,7 +100,7 @@ begin
        if rising_edge(clk) then
            oack <= '0';
            if (wb_in.cyc and wb_in.stb) = '1' then
-                adr := to_integer((unsigned(wb_in.adr(INIT_RAM_ABITS-1 downto 2))));
+                adr := to_integer((unsigned(wb_in.adr(INIT_RAM_ABITS - 3 downto 0))));
                if wb_in.we = '0' then
                   obuf <= init_ram(adr);
                else
--- a/litedram/generated/arty/litedram_core.init
+++ b/litedram/generated/arty/litedram_core.init
--- a/litedram/generated/arty/litedram_core.v
+++ b/litedram/generated/arty/litedram_core.v
--- a/litedram/generated/genesys2/litedram-initmem.vhdl
+++ b/litedram/generated/genesys2/litedram-initmem.vhdl
@ -100,7 +100,7 @@ begin
        if rising_edge(clk) then
            oack <= '0';
            if (wb_in.cyc and wb_in.stb) = '1' then
-                adr := to_integer((unsigned(wb_in.adr(INIT_RAM_ABITS-1 downto 2))));
+                adr := to_integer((unsigned(wb_in.adr(INIT_RAM_ABITS - 3 downto 0))));
                if wb_in.we = '0' then
                   obuf <= init_ram(adr);
                else
--- a/litedram/generated/genesys2/litedram_core.init
+++ b/litedram/generated/genesys2/litedram_core.init
--- a/litedram/generated/genesys2/litedram_core.v
+++ b/litedram/generated/genesys2/litedram_core.v
--- a/litedram/generated/nexys-video/litedram-initmem.vhdl
+++ b/litedram/generated/nexys-video/litedram-initmem.vhdl
@ -100,7 +100,7 @@ begin
        if rising_edge(clk) then
            oack <= '0';
            if (wb_in.cyc and wb_in.stb) = '1' then
-                adr := to_integer((unsigned(wb_in.adr(INIT_RAM_ABITS-1 downto 2))));
+                adr := to_integer((unsigned(wb_in.adr(INIT_RAM_ABITS - 3 downto 0))));
                if wb_in.we = '0' then
                   obuf <= init_ram(adr);
                else
--- a/litedram/generated/nexys-video/litedram_core.init
+++ b/litedram/generated/nexys-video/litedram_core.init
--- a/litedram/generated/nexys-video/litedram_core.v
+++ b/litedram/generated/nexys-video/litedram_core.v
--- a/litedram/generated/orangecrab-85-0.2/litedram-initmem.vhdl
+++ b/litedram/generated/orangecrab-85-0.2/litedram-initmem.vhdl
@ -0,0 +1,123 @@
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+use std.textio.all;
+
+library work;
+use work.wishbone_types.all;
+use work.utils.all;
+
+-- Wishbone-attached RAM whose initial contents come from the init image
+-- file named in the architecture, optionally followed by an extra payload.
+--
+-- Generics:
+--   EXTRA_PAYLOAD_FILE : payload file, read only when the rounded payload
+--                        size is non-zero
+--   EXTRA_PAYLOAD_SIZE : payload size in bytes; the architecture passes it
+--                        through round_up(..., 8) before sizing the RAM
+-- Ports:
+--   clk    : clock, all accesses happen on its rising edge
+--   wb_in  : wishbone request (cyc, stb, we, adr, sel and dat are used)
+--   wb_out : wishbone response (ack, dat and stall are driven)
+entity dram_init_mem is
+    generic (
+        EXTRA_PAYLOAD_FILE : string   := "";
+        EXTRA_PAYLOAD_SIZE : integer  := 0
+        );
+    port (
+        clk     : in std_ulogic;
+        wb_in   : in wb_io_master_out;
+        wb_out  : out wb_io_slave_out
+      );
+end entity dram_init_mem;
+
+architecture rtl of dram_init_mem is
+
+    constant INIT_RAM_SIZE    : integer := 24576;
+    constant RND_PAYLOAD_SIZE : integer := round_up(EXTRA_PAYLOAD_SIZE, 8);
+    constant TOTAL_RAM_SIZE   : integer := INIT_RAM_SIZE + RND_PAYLOAD_SIZE;
+    constant INIT_RAM_ABITS   : integer := log2ceil(TOTAL_RAM_SIZE-1);
+    constant INIT_RAM_FILE    : string := "litedram_core.init";
+
+    type ram_t is array(0 to (TOTAL_RAM_SIZE / 4) - 1) of std_logic_vector(31 downto 0);
+
+    -- XXX FIXME: Have a single init function called twice with
+    -- an offset as argument
+    --
+    -- Load the extra payload into ram, starting right after the init
+    -- image (word index INIT_RAM_SIZE/4). Each line of the file is read
+    -- with hread as one 64-bit value and stored as two consecutive 32-bit
+    -- words, low half first.
+    --   ram      : RAM image, updated in place
+    --   filename : payload file, opened in read mode
+    -- Fails with "Payload too big !" if the file still has lines left
+    -- once the payload area is full.
+    -- NOTE(review): the same loop presumably exists in the gen-src
+    -- template this file appears to be generated from; fix it there too.
+    procedure init_load_payload(ram: inout ram_t; filename: string) is
+        file payload_file : text open read_mode is filename;
+        variable ram_line : line;
+        variable temp_word : std_logic_vector(63 downto 0);
+    begin
+        -- RND_PAYLOAD_SIZE is in bytes and each line fills 8 of them, so
+        -- iterate over 64-bit entries (as init_load_ram does) rather than
+        -- bytes. A byte-count bound would let an oversized file index past
+        -- the end of ram before the size check below could report it.
+        for i in 0 to (RND_PAYLOAD_SIZE/8)-1 loop
+            exit when endfile(payload_file);
+            readline(payload_file, ram_line);
+            hread(ram_line, temp_word);
+            ram((INIT_RAM_SIZE/4) + i*2) := temp_word(31 downto 0);
+            ram((INIT_RAM_SIZE/4) + i*2+1) := temp_word(63 downto 32);
+        end loop;
+        -- Anything left unread means the file exceeds the declared size
+        assert endfile(payload_file) report "Payload too big !" severity failure;
+    end procedure;
+
+    -- Build the initial RAM contents from the file `name`.
+    --
+    -- The image starts out all zero. Up to INIT_RAM_SIZE/8 lines are read
+    -- from the file, each parsed with hread as one 64-bit value and stored
+    -- as two consecutive 32-bit words (low half first) from index 0 up.
+    -- Reading stops early at end of file, leaving the remaining words zero.
+    -- When RND_PAYLOAD_SIZE is non-zero the extra payload is then loaded
+    -- from EXTRA_PAYLOAD_FILE by init_load_payload.
+    -- Returns the populated RAM image.
+    impure function init_load_ram(name : string) return ram_t is
+        file ram_file : text open read_mode is name;
+        variable temp_word : std_logic_vector(63 downto 0);
+        variable temp_ram : ram_t := (others => (others => '0'));
+        variable ram_line : line;
+    begin
+        -- Log the sizes in use
+        report "Payload size:" & integer'image(EXTRA_PAYLOAD_SIZE) &
+            " rounded to:" & integer'image(RND_PAYLOAD_SIZE);
+        report "Total RAM size:" & integer'image(TOTAL_RAM_SIZE) &
+            " bytes using " & integer'image(INIT_RAM_ABITS) &
+            " address bits";
+        -- One 64-bit file line fills two 32-bit RAM words
+        for i in 0 to (INIT_RAM_SIZE/8)-1 loop
+            exit when endfile(ram_file);
+            readline(ram_file, ram_line);
+            hread(ram_line, temp_word);
+            temp_ram(i*2) := temp_word(31 downto 0);
+            temp_ram(i*2+1) := temp_word(63 downto 32);
+        end loop;
+        -- Skip the payload file entirely when no payload was requested
+        if RND_PAYLOAD_SIZE /= 0 then
+            init_load_payload(temp_ram, EXTRA_PAYLOAD_FILE);
+        end if;
+        return temp_ram;
+    end function;
+
+    -- Produce a RAM image in which every bit of every word is '0'
+    impure function init_zero return ram_t is
+        constant zeroed : ram_t := (others => (others => '0'));
+    begin
+        return zeroed;
+    end function;
+
+    -- Choose the initial RAM contents: report the file name, then load
+    -- the image from it, or fall back to an all-zero image when the name
+    -- is empty
+    impure function initialize_ram(filename: string) return ram_t is
+    begin
+        report "Opening file " & filename;
+        if filename'length /= 0 then
+            return init_load_ram(filename);
+        end if;
+        return init_zero;
+    end function;
+    signal init_ram : ram_t := initialize_ram(INIT_RAM_FILE);
+
+    attribute ram_style : string;
+    attribute ram_style of init_ram: signal is "block";
+
+    signal obuf : std_ulogic_vector(31 downto 0);
+    signal oack : std_ulogic;
+begin
+
+    -- Synchronous wishbone access to init_ram.
+    --
+    -- On each rising clock edge with cyc and stb both set, the low
+    -- INIT_RAM_ABITS-2 bits of wb_in.adr select one 32-bit word. A read
+    -- latches that word into obuf; a write updates only the bytes whose
+    -- wb_in.sel bit is set. oack is raised for that cycle and cleared
+    -- otherwise. wb_out.ack and wb_out.dat are registered copies of oack
+    -- and obuf, so the response appears one clock edge after they are set.
+    -- NOTE(review): adr is not range-checked against the bounds of
+    -- init_ram; confirm requests stay within TOTAL_RAM_SIZE.
+    init_ram_0: process(clk)
+        variable adr  : integer;
+    begin
+        if rising_edge(clk) then
+            -- Default: no acknowledge unless a request is seen below
+            oack <= '0';
+            if (wb_in.cyc and wb_in.stb) = '1' then
+                -- Address bits are used directly as the 32-bit word index
+                adr := to_integer((unsigned(wb_in.adr(INIT_RAM_ABITS - 3 downto 0))));
+                if wb_in.we = '0' then
+                   obuf <= init_ram(adr);
+                else
+                    -- Byte-lane write enables, one sel bit per byte
+                    for i in 0 to 3 loop
+                        if wb_in.sel(i) = '1' then
+                            init_ram(adr)(((i + 1) * 8) - 1 downto i * 8) <=
+                                wb_in.dat(((i + 1) * 8) - 1 downto i * 8);
+                        end if;
+                    end loop;
+                end if;
+                oack <= '1';
+            end if;
+            -- Output register stage: these pick up the previous values of
+            -- oack and obuf
+            wb_out.ack <= oack;
+            wb_out.dat <= obuf;
+        end if;
+    end process;
+
+    wb_out.stall <= '0';
+
+end architecture rtl;
--- a/litedram/generated/orangecrab-85-0.2/litedram_core.init
+++ b/litedram/generated/orangecrab-85-0.2/litedram_core.init
--- a/litedram/generated/orangecrab-85-0.2/litedram_core.v
+++ b/litedram/generated/orangecrab-85-0.2/litedram_core.v
--- a/litedram/generated/sim/litedram-initmem.vhdl
+++ b/litedram/generated/sim/litedram-initmem.vhdl
@ -100,7 +100,7 @@ begin
        if rising_edge(clk) then
            oack <= '0';
            if (wb_in.cyc and wb_in.stb) = '1' then
-                adr := to_integer((unsigned(wb_in.adr(INIT_RAM_ABITS-1 downto 2))));
+                adr := to_integer((unsigned(wb_in.adr(INIT_RAM_ABITS - 3 downto 0))));
                if wb_in.we = '0' then
                   obuf <= init_ram(adr);
                else
--- a/litedram/generated/sim/litedram_core.init
+++ b/litedram/generated/sim/litedram_core.init
--- a/litedram/generated/sim/litedram_core.v
+++ b/litedram/generated/sim/litedram_core.v
--- a/litedram/generated/wukong-v2/litedram-initmem.vhdl
+++ b/litedram/generated/wukong-v2/litedram-initmem.vhdl
@ -0,0 +1,123 @@
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+use std.textio.all;
+
+library work;
+use work.wishbone_types.all;
+use work.utils.all;
+
+-- Initialisation memory exposed through a wishbone IO port.
+-- Generics:
+--   EXTRA_PAYLOAD_FILE : name of an optional payload file; the architecture
+--                        passes it to init_load_payload when the rounded
+--                        payload size is non-zero. Defaults to "".
+--   EXTRA_PAYLOAD_SIZE : payload size; the architecture adds it (after
+--                        round_up(..., 8)) to the RAM size, which it reports
+--                        in bytes. Defaults to 0.
+-- Ports:
+--   clk    : clock; the architecture acts on its rising edge.
+--   wb_in  : request signals from the master (record type declared in
+--            work.wishbone_types).
+--   wb_out : response signals to the master (ack, dat, stall are driven).
+entity dram_init_mem is
+    generic (
+        EXTRA_PAYLOAD_FILE : string   := "";
+        EXTRA_PAYLOAD_SIZE : integer  := 0
+        );
+    port (
+        clk     : in std_ulogic;
+        wb_in   : in wb_io_master_out;
+        wb_out  : out wb_io_slave_out
+      );
+end entity dram_init_mem;
+
+-- RTL architecture: a 32-bit-wide RAM preloaded from INIT_RAM_FILE, with an
+-- optional extra payload placed directly after it, accessed through a
+-- simple clocked wishbone port.
+architecture rtl of dram_init_mem is
+
+    -- Sizes are in bytes (see the "Total RAM size" report below). The
+    -- payload size is passed through round_up(..., 8) because the loaders
+    -- consume their files one 64-bit (8-byte) line at a time.
+    constant INIT_RAM_SIZE    : integer := 24576;
+    constant RND_PAYLOAD_SIZE : integer := round_up(EXTRA_PAYLOAD_SIZE, 8);
+    constant TOTAL_RAM_SIZE   : integer := INIT_RAM_SIZE + RND_PAYLOAD_SIZE;
+    constant INIT_RAM_ABITS   : integer := log2ceil(TOTAL_RAM_SIZE-1);
+    constant INIT_RAM_FILE    : string := "litedram_core.init";
+
+    -- One entry per 32-bit word of the combined init + payload area.
+    type ram_t is array(0 to (TOTAL_RAM_SIZE / 4) - 1) of std_logic_vector(31 downto 0);
+
+    -- XXX FIXME: Have a single init function called twice with
+    -- an offset as argument
+
+    -- Load the extra payload into `ram`, starting at word INIT_RAM_SIZE/4.
+    --   ram      : RAM image to update in place.
+    --   filename : text file with one 64-bit hex word per line.
+    -- Each line is stored as two consecutive 32-bit entries, low half first.
+    -- Fails with "Payload too big !" if the file holds more lines than the
+    -- payload area can take.
+    procedure init_load_payload(ram: inout ram_t; filename: string) is
+        file payload_file : text open read_mode is filename;
+        variable ram_line : line;
+        variable temp_word : std_logic_vector(63 downto 0);
+    begin
+        -- The payload area is RND_PAYLOAD_SIZE bytes, i.e. RND_PAYLOAD_SIZE/8
+        -- lines of 64 bits. Bounding the loop by that line count (rather than
+        -- the byte count) keeps the index inside `ram`, so an oversized file
+        -- reaches the assert below instead of indexing past the array end.
+        for i in 0 to (RND_PAYLOAD_SIZE/8)-1 loop
+            exit when endfile(payload_file);
+            readline(payload_file, ram_line);
+            hread(ram_line, temp_word);
+            ram((INIT_RAM_SIZE/4) + i*2) := temp_word(31 downto 0);
+            ram((INIT_RAM_SIZE/4) + i*2+1) := temp_word(63 downto 32);
+        end loop;
+        assert endfile(payload_file) report "Payload too big !" severity failure;
+    end procedure;
+
+    -- Build the full RAM image from the init file `name`.
+    -- Starts from an all-zero image, reads up to INIT_RAM_SIZE/8 lines of
+    -- 64-bit hex (stored low half first), then appends the extra payload
+    -- when RND_PAYLOAD_SIZE is non-zero. Lines beyond INIT_RAM_SIZE/8 in
+    -- the init file are not read.
+    impure function init_load_ram(name : string) return ram_t is
+        file ram_file : text open read_mode is name;
+        variable temp_word : std_logic_vector(63 downto 0);
+        variable temp_ram : ram_t := (others => (others => '0'));
+        variable ram_line : line;
+    begin
+        report "Payload size:" & integer'image(EXTRA_PAYLOAD_SIZE) &
+            " rounded to:" & integer'image(RND_PAYLOAD_SIZE);
+        report "Total RAM size:" & integer'image(TOTAL_RAM_SIZE) &
+            " bytes using " & integer'image(INIT_RAM_ABITS) &
+            " address bits";
+        for i in 0 to (INIT_RAM_SIZE/8)-1 loop
+            exit when endfile(ram_file);
+            readline(ram_file, ram_line);
+            hread(ram_line, temp_word);
+            temp_ram(i*2) := temp_word(31 downto 0);
+            temp_ram(i*2+1) := temp_word(63 downto 32);
+        end loop;
+        if RND_PAYLOAD_SIZE /= 0 then
+            init_load_payload(temp_ram, EXTRA_PAYLOAD_FILE);
+        end if;
+        return temp_ram;
+    end function;
+
+    -- Return a RAM image with every bit cleared.
+    impure function init_zero return ram_t is
+        variable temp_ram : ram_t := (others => (others => '0'));
+    begin
+        return temp_ram;
+    end function;
+
+    -- Choose the initial contents: all zeros for an empty file name,
+    -- otherwise the image loaded from `filename`. Reports the name first.
+    impure function initialize_ram(filename: string) return ram_t is
+    begin
+        report "Opening file " & filename;
+        if filename'length = 0 then
+            return init_zero;
+        else
+            return init_load_ram(filename);
+        end if;
+    end function;
+
+    -- The memory array, initialised at elaboration from INIT_RAM_FILE.
+    signal init_ram : ram_t := initialize_ram(INIT_RAM_FILE);
+
+    -- NOTE(review): presumably a vendor synthesis hint requesting block RAM;
+    -- the effect is tool-specific -- confirm for the target toolchain.
+    attribute ram_style : string;
+    attribute ram_style of init_ram: signal is "block";
+
+    -- Internal read-data and acknowledge registers; wb_out.dat / wb_out.ack
+    -- are registered copies of these.
+    signal obuf : std_ulogic_vector(31 downto 0);
+    signal oack : std_ulogic;
+begin
+
+    -- Clocked wishbone access to init_ram. A request is recognised on a
+    -- rising clk edge when wb_in.cyc and wb_in.stb are both '1'.
+    init_ram_0: process(clk)
+        -- Word index into init_ram for the current request.
+        variable adr  : integer;
+    begin
+        if rising_edge(clk) then
+            -- Default: no acknowledge unless a request is seen below.
+            oack <= '0';
+            if (wb_in.cyc and wb_in.stb) = '1' then
+                -- The low INIT_RAM_ABITS-2 bits of wb_in.adr select the word.
+                -- NOTE(review): when TOTAL_RAM_SIZE is not a power of two this
+                -- slice can presumably encode an index beyond the last entry
+                -- of init_ram -- confirm callers never address past the end.
+                adr := to_integer((unsigned(wb_in.adr(INIT_RAM_ABITS - 3 downto 0))));
+                if wb_in.we = '0' then
+                    obuf <= init_ram(adr);
+                else
+                    -- Byte-lane write: lane i covers bits (8*i+7 downto 8*i).
+                    for i in 0 to 3 loop
+                        if wb_in.sel(i) = '1' then
+                            init_ram(adr)(((i + 1) * 8) - 1 downto i * 8) <=
+                                wb_in.dat(((i + 1) * 8) - 1 downto i * 8);
+                        end if;
+                    end loop;
+                end if;
+                oack <= '1';
+            end if;
+            -- These read the values oack/obuf held before this edge, so the
+            -- outputs lag the internal registers by one clock.
+            wb_out.ack <= oack;
+            wb_out.dat <= obuf;
+        end if;
+    end process;
+
+    -- The stall output is tied low permanently.
+    wb_out.stall <= '0';
+
+end architecture rtl;
--- a/litedram/generated/wukong-v2/litedram_core.init
+++ b/litedram/generated/wukong-v2/litedram_core.init
--- a/litedram/generated/wukong-v2/litedram_core.v
+++ b/litedram/generated/wukong-v2/litedram_core.v
--- a/liteeth/gen-src/arty.yml
+++ b/liteeth/gen-src/arty.yml
@ -8,8 +8,9 @@ vendor:     xilinx
 clk_freq:   100e6
 core:       wishbone
 endianness: little
+ntxslots:   2
+nrxslots:   2

 soc:
    mem_map:
        ethmac: 0x00010000
-    csr_data_width: 32
--- a/Show More
+++ b/Show More