diff --git a/Makefile b/Makefile
index 096be56..b584895 100644
--- a/Makefile
+++ b/Makefile
@@ -58,7 +58,8 @@ uart_files = $(wildcard uart16550/*.v)
 
 soc_sim_files = $(soc_files) sim_console.vhdl sim_pp_uart.vhdl sim_bram_helpers.vhdl \
 	sim_bram.vhdl sim_jtag_socket.vhdl sim_jtag.vhdl dmi_dtm_xilinx.vhdl \
-	sim_16550_uart.vhdl
+	sim_16550_uart.vhdl \
+	random.vhdl glibc_random.vhdl glibc_random_helpers.vhdl
 
 soc_sim_c_files = sim_vhpi_c.c sim_bram_helpers_c.c sim_console_c.c \
 	sim_jtag_socket_c.c
@@ -177,7 +178,8 @@ toplevel=fpga/top-generic.vhdl
 dmi_dtm=dmi_dtm_dummy.vhdl
 
 fpga_files = $(core_files) $(soc_files) fpga/soc_reset.vhdl \
-	fpga/pp_fifo.vhd fpga/pp_soc_uart.vhd fpga/main_bram.vhdl
+	fpga/pp_fifo.vhd fpga/pp_soc_uart.vhd fpga/main_bram.vhdl \
+	nonrandom.vhdl
 
 synth_files = $(core_files) $(soc_files) $(fpga_files) $(clkgen) $(toplevel) $(dmi_dtm)
 
diff --git a/common.vhdl b/common.vhdl
index 28b3434..bd9210b 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -26,6 +26,7 @@ package common is
     constant SPR_XER    : spr_num_t := 1;
     constant SPR_LR     : spr_num_t := 8;
     constant SPR_CTR    : spr_num_t := 9;
+    constant SPR_TAR    : spr_num_t := 815;
     constant SPR_DSISR  : spr_num_t := 18;
     constant SPR_DAR    : spr_num_t := 19;
     constant SPR_TB     : spr_num_t := 268;
@@ -182,16 +183,25 @@ package common is
 	 is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', br_pred => '0',
          byte_reverse => '0', sign_extend => '0', update => '0', nia => (others => '0'), read_data1 => (others => '0'), read_data2 => (others => '0'), read_data3 => (others => '0'), cr => (others => '0'), insn => (others => '0'), data_len => (others => '0'), others => (others => '0'));
 
-    type Execute1ToMultiplyType is record
+    type MultiplyInputType is record
 	valid: std_ulogic;
 	data1: std_ulogic_vector(63 downto 0);
 	data2: std_ulogic_vector(63 downto 0);
+        addend: std_ulogic_vector(127 downto 0);
 	is_32bit: std_ulogic;
-        neg_result: std_ulogic;
+        not_result: std_ulogic;
+    end record;
+    constant MultiplyInputInit : MultiplyInputType := (valid => '0',
+                                                       is_32bit => '0', not_result => '0',
+                                                       others => (others => '0'));
+
+    type MultiplyOutputType is record
+	valid: std_ulogic;
+	result: std_ulogic_vector(127 downto 0);
+        overflow : std_ulogic;
     end record;
-    constant Execute1ToMultiplyInit : Execute1ToMultiplyType := (valid => '0',
-								 is_32bit => '0', neg_result => '0',
-								 others => (others => '0'));
+    constant MultiplyOutputInit : MultiplyOutputType := (valid => '0', overflow => '0',
+                                                         others => (others => '0'));
 
     type Execute1ToDividerType is record
 	valid: std_ulogic;
@@ -382,14 +392,6 @@ package common is
                                    write_cr_data => (others => '0'), write_reg => (others => '0'),
                                    exc_write_reg => (others => '0'), exc_write_data => (others => '0'));
 
-    type MultiplyToExecute1Type is record
-	valid: std_ulogic;
-	result: std_ulogic_vector(127 downto 0);
-        overflow : std_ulogic;
-    end record;
-    constant MultiplyToExecute1Init : MultiplyToExecute1Type := (valid => '0', overflow => '0',
-								 others => (others => '0'));
-
     type DividerToExecute1Type is record
 	valid: std_ulogic;
 	write_reg_data: std_ulogic_vector(63 downto 0);
@@ -458,6 +460,8 @@ package body common is
            n := 11;
        when SPR_XER =>
            n := 12;
+       when SPR_TAR =>
+           n := 13;
        when others =>
            n := 0;
            return "000000";
diff --git a/decode1.vhdl b/decode1.vhdl
index f553e2d..21fea4a 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -34,6 +34,8 @@ architecture behaviour of decode1 is
     subtype major_opcode_t is unsigned(5 downto 0);
     type major_rom_array_t is array(0 to 63) of decode_rom_t;
     type minor_valid_array_t is array(0 to 1023) of std_ulogic;
+    type minor_valid_array_2t is array(0 to 2047) of std_ulogic;
+    type op_4_subop_array_t is array(0 to 63) of decode_rom_t;
     type op_19_subop_array_t is array(0 to 7) of decode_rom_t;
     type op_30_subop_array_t is array(0 to 15) of decode_rom_t;
     type op_31_subop_array_t is array(0 to 1023) of decode_rom_t;
@@ -85,6 +87,24 @@ architecture behaviour of decode1 is
         others   => illegal_inst
         );
 
+    -- validity table for major opcode 4: indexed by bits 5..0 (upper 6 index bits) and 10..6 (lower 5 index bits) of instruction word
+    constant decode_op_4_valid : minor_valid_array_2t := (
+        2#11000000000# to 2#11000011111# => '1',        -- maddhd: bits 5..0 = 110000, any value of bits 10..6
+        2#11000100000# to 2#11000111111# => '1',        -- maddhdu: bits 5..0 = 110001, any value of bits 10..6
+        2#11001100000# to 2#11001111111# => '1',        -- maddld: bits 5..0 = 110011, any value of bits 10..6
+        others => '0'                                   -- every other index is marked not valid
+        );
+
+    -- indexed by bits 5..0 of instruction word; bit 0 is part of this minor opcode (not a record bit), so rc must be NONE
+    constant decode_op_4_array : op_4_subop_array_t := (
+        --                   unit    internal      in1         in2          in3   out   CR   CR   inv  inv  cry   cry  ldst  BR   sgn  upd  rsrv 32b  sgn  rc    lk   sgl
+        --                                op                                            in   out   A   out  in    out  len        ext                                 pipe
+        2#110000#  =>       (ALU,    OP_MUL_H64,   RA,         RB,          RCR,  RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- maddhd (signed, high 64 bits)
+        2#110001#  =>       (ALU,    OP_MUL_H64,   RA,         RB,          RCR,  RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- maddhdu (unsigned, high 64 bits)
+        2#110011#  =>       (ALU,    OP_MUL_L64,   RA,         RB,          RCR,  RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- maddld (low 64 bits)
+        others   => decode_rom_init
+        );
+
     -- indexed by bits 10..1 of instruction word
     constant decode_op_19_valid : minor_valid_array_t := (
         -- addpcis, 5 upper bits are part of constant
@@ -94,7 +114,7 @@ architecture behaviour of decode1 is
         2#1100000010# => '1', 2#1100100010# => '1', 2#1101000010# => '1', 2#1101100010# => '1', 2#1110000010# => '1', 2#1110100010# => '1', 2#1111000010# => '1', 2#1111100010# => '1',
         2#1000010000# => '1', -- bcctr
         2#0000010000# => '1', -- bclr
-        2#1000110000# => '0', -- bctar
+        2#1000110000# => '1', -- bctar
         2#0100000001# => '1', -- crand
         2#0010000001# => '1', -- crandc
         2#0100100001# => '1', -- creqv
@@ -152,23 +172,27 @@ architecture behaviour of decode1 is
         2#1000001010#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- addco
         2#0010001010#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- adde
         2#1010001010#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- addeo
+        2#0010101010#  =>       (ALU,    OP_ADD,       RA,         RB,          NONE, RT,   '0', '0', '0', '0', OV,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- addex
+        2#0001001010#  =>       (ALU,    OP_ADDG6S,    RA,         RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- addg6s
         2#0011101010#  =>       (ALU,    OP_ADD,       RA,         CONST_M1,    NONE, RT,   '0', '0', '0', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- addme
         2#1011101010#  =>       (ALU,    OP_ADD,       RA,         CONST_M1,    NONE, RT,   '0', '0', '0', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- addmeo
         2#0011001010#  =>       (ALU,    OP_ADD,       RA,         NONE,        NONE, RT,   '0', '0', '0', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- addze
         2#1011001010#  =>       (ALU,    OP_ADD,       RA,         NONE,        NONE, RT,   '0', '0', '0', '0', CA,   '1', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- addzeo
         2#0000011100#  =>       (ALU,    OP_AND,       NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- and
         2#0000111100#  =>       (ALU,    OP_AND,       NONE,       RB,          RS,   RA,   '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- andc
-        -- 2#0011111100# bperm
+        2#0011111100#  =>       (ALU,    OP_BPERM,     NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- bperm (two-operand: reads RS and RB)
+        2#0100111010#  =>       (ALU,    OP_BCD,       NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cbcdtd
+        2#0100011010#  =>       (ALU,    OP_BCD,       NONE,       NONE,        RS,   RA,   '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cdtbcd
         2#0000000000#  =>       (ALU,    OP_CMP,       RA,         RB,          NONE, NONE, '0', '1', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- cmp
         2#0111111100#  =>       (ALU,    OP_CMPB,      NONE,       RB,          RS,   RA,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpb
-        -- 2#0011100000# cmpeqb
+        2#0011100000#  =>       (ALU,    OP_CMPEQB,    RA,         RB,          NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpeqb
         2#0000100000#  =>       (ALU,    OP_CMP,       RA,         RB,          NONE, NONE, '0', '1', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpl
-        -- 2#0011000000# cmprb
+        2#0011000000#  =>       (ALU,    OP_CMPRB,     RA,         RB,          NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmprb
         2#0000111010#  =>       (ALU,    OP_CNTZ,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- cntlzd
         2#0000011010#  =>       (ALU,    OP_CNTZ,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- cntlzw
         2#1000111010#  =>       (ALU,    OP_CNTZ,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- cnttzd
         2#1000011010#  =>       (ALU,    OP_CNTZ,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- cnttzw
-        -- 2#1011110011# darn
+        2#1011110011#  =>       (ALU,    OP_DARN,      NONE,       NONE,        NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- darn
         2#0001010110#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbf
         2#0000110110#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbst
         2#0100010110#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbt
@@ -254,8 +278,7 @@ architecture behaviour of decode1 is
         2#1100010101#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lwzcix
         2#0000110111#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lwzux
         2#0000010111#  =>       (LDST,   OP_LOAD,      RA_OR_ZERO, RB,          NONE, RT,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lwzx
-        -- 2#1000000000# mcrxr
-        -- 2#1001000000# mcrxrx
+        2#1001000000#  =>       (ALU,    OP_MCRXRX,    NONE,       NONE,        NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mcrxrx
         2#0000010011#  =>       (ALU,    OP_MFCR,      NONE,       NONE,        NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mfcr/mfocrf
         2#0001010011#  =>       (ALU,    OP_MFMSR,     NONE,       NONE,        NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- mfmsr
         2#0101010011#  =>       (ALU,    OP_MFSPR,     SPR,        NONE,        RS,   RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- mfspr
@@ -282,6 +305,15 @@ architecture behaviour of decode1 is
         2#0111011100#  =>       (ALU,    OP_AND,       NONE,       RB,          RS,   RA,   '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- nand
         2#0001101000#  =>       (ALU,    OP_ADD,       RA,         NONE,        NONE, RT,   '0', '0', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- neg
         2#1001101000#  =>       (ALU,    OP_ADD,       RA,         NONE,        NONE, RT,   '0', '0', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- nego
+        -- next 8 are reserved no-op instructions
+        2#1000010010#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- nop
+        2#1000110010#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- nop
+        2#1001010010#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- nop
+        2#1001110010#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- nop
+        2#1010010010#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- nop
+        2#1010110010#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- nop
+        2#1011010010#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- nop
+        2#1011110010#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- nop
         2#0001111100#  =>       (ALU,    OP_OR,        NONE,       RB,          RS,   RA,   '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- nor
         2#0110111100#  =>       (ALU,    OP_OR,        NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- or
         2#0110011100#  =>       (ALU,    OP_OR,        NONE,       RB,          RS,   RA,   '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- orc
@@ -290,7 +322,7 @@ architecture behaviour of decode1 is
         2#0101111010#  =>       (ALU,    OP_POPCNT,    NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntw
         2#0010111010#  =>       (ALU,    OP_PRTY,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- prtyd
         2#0010011010#  =>       (ALU,    OP_PRTY,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- prtyw
-        -- 2#0010000000# setb
+        2#0010000000#  =>       (ALU,    OP_SETB,      NONE,       NONE,        NONE, RT,   '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- setb
         2#0111110010#  =>       (LDST,   OP_TLBIE,     NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- slbia
         2#0000011011#  =>       (ALU,    OP_SHL,       NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- sld
         2#0000011000#  =>       (ALU,    OP_SHL,       NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- slw
@@ -335,6 +367,7 @@ architecture behaviour of decode1 is
         2#0000000100#  =>       (ALU,    OP_TRAP,      RA,         RB,          NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '1'), -- tw
         2#0100110010#  =>       (LDST,   OP_TLBIE,     NONE,       RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- tlbie
         2#0100010010#  =>       (LDST,   OP_TLBIE,     NONE,       RB,          RS,   NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- tlbiel
+        2#0000011110#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- wait
         2#0100111100#  =>       (ALU,    OP_XOR,       NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- xor
         others => illegal_inst
 	);
@@ -391,6 +424,7 @@ begin
         variable v : Decode1ToDecode2Type;
         variable f : Decode1ToFetch1Type;
         variable majorop : major_opcode_t;
+        variable minor4op : std_ulogic_vector(10 downto 0);
         variable op_19_bits: std_ulogic_vector(2 downto 0);
         variable sprn : spr_num_t;
         variable br_nia    : std_ulogic_vector(61 downto 0);
@@ -419,6 +453,15 @@ begin
             end if;
             v.decode := fetch_fail_inst;
 
+        elsif majorop = "000100" then
+            -- major opcode 4, mostly VMX/VSX stuff but also some integer ops (madd*)
+            minor4op := f_in.insn(5 downto 0) & f_in.insn(10 downto 6);  -- 11-bit index into the validity table
+            if decode_op_4_valid(to_integer(unsigned(minor4op))) = '1' then
+                v.decode := decode_op_4_array(to_integer(unsigned(f_in.insn(5 downto 0))));  -- decode ROM is indexed by bits 5..0 only
+            else
+                v.decode := illegal_inst;  -- anything not marked valid decodes as illegal
+            end if;
+
         elsif majorop = "011111" then
             -- major opcode 31, lots of things
             v.decode := decode_op_31_array(to_integer(unsigned(f_in.insn(10 downto 1))));
@@ -467,11 +510,12 @@ begin
                 if f_in.insn(23) = '0' then
                     v.ispr1 := fast_spr_num(SPR_CTR);
                 end if;
-                -- TODO: Add TAR
                 if f_in.insn(10) = '0' then
                     v.ispr2 := fast_spr_num(SPR_LR);
-                else
+                elsif f_in.insn(6) = '0' then
                     v.ispr2 := fast_spr_num(SPR_CTR);
+                else
+                    v.ispr2 := fast_spr_num(SPR_TAR);
                 end if;
             else
                 -- Could be OP_RFID
diff --git a/decode2.vhdl b/decode2.vhdl
index 62c574c..b1531f1 100644
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -135,6 +135,8 @@ architecture behaviour of decode2 is
         case t is
             when RS =>
                 return ('1', gpr_to_gspr(insn_rs(insn_in)), reg_data);
+            when RCR =>
+                return ('1', gpr_to_gspr(insn_rcreg(insn_in)), reg_data);
             when NONE =>
                 return ('0', (others => '0'), (others => '0'));
         end case;
@@ -282,7 +284,8 @@ begin
                        else gpr_to_gspr(insn_ra(d_in.insn));
     r_out.read2_reg <= d_in.ispr2 when d_in.decode.input_reg_b = SPR
                        else gpr_to_gspr(insn_rb(d_in.insn));
-    r_out.read3_reg <= insn_rs(d_in.insn);
+    r_out.read3_reg <= insn_rcreg(d_in.insn) when d_in.decode.input_reg_c = RCR
+                       else insn_rs(d_in.insn);
 
     c_out.read <= d_in.decode.input_cr;
 
diff --git a/decode_types.vhdl b/decode_types.vhdl
index 9cd6d69..ef654c3 100644
--- a/decode_types.vhdl
+++ b/decode_types.vhdl
@@ -9,8 +9,8 @@ package decode_types is
 			 OP_DARN, OP_DCBF, OP_DCBST, OP_DCBT, OP_DCBTST,
 			 OP_DCBZ, OP_DIV, OP_DIVE, OP_EXTS,
 			 OP_EXTSWSLI, OP_ICBI, OP_ICBT, OP_ISEL, OP_ISYNC,
-			 OP_LOAD, OP_STORE, OP_MADDHD, OP_MADDHDU, OP_MADDLD,
-			 OP_MCRXR, OP_MCRXRX, OP_MFCR, OP_MFMSR, OP_MFSPR, OP_MOD,
+			 OP_LOAD, OP_STORE,
+			 OP_MCRXRX, OP_MFCR, OP_MFMSR, OP_MFSPR, OP_MOD,
 			 OP_MTCRF, OP_MTMSRD, OP_MTSPR, OP_MUL_L64,
 			 OP_MUL_H64, OP_MUL_H32, OP_OR,
 			 OP_POPCNT, OP_PRTY, OP_RFID,
@@ -18,15 +18,16 @@ package decode_types is
 			 OP_SHL, OP_SHR,
 			 OP_SYNC, OP_TLBIE, OP_TRAP,
 			 OP_XOR,
+                         OP_BCD, OP_ADDG6S,
                          OP_FETCH_FAILED
 			 );
     type input_reg_a_t is (NONE, RA, RA_OR_ZERO, SPR, CIA);
     type input_reg_b_t is (NONE, RB, CONST_UI, CONST_SI, CONST_SI_HI, CONST_UI_HI, CONST_LI, CONST_BD,
                            CONST_DXHI4, CONST_DS, CONST_M1, CONST_SH, CONST_SH32, SPR);
-    type input_reg_c_t is (NONE, RS);
+    type input_reg_c_t is (NONE, RS, RCR);
     type output_reg_a_t is (NONE, RT, RA, SPR);
     type rc_t is (NONE, ONE, RC);
-    type carry_in_t is (ZERO, CA, ONE);
+    type carry_in_t is (ZERO, CA, OV, ONE);
 
     constant SH_OFFSET : integer := 0;
     constant MB_OFFSET : integer := 1;
diff --git a/execute1.vhdl b/execute1.vhdl
index 2722570..1b83997 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -56,6 +56,7 @@ architecture behaviour of execute1 is
 	lr_update : std_ulogic;
 	next_lr : std_ulogic_vector(63 downto 0);
 	mul_in_progress : std_ulogic;
+        mul_finish : std_ulogic;
         div_in_progress : std_ulogic;
         cntz_in_progress : std_ulogic;
         slow_op_insn : insn_type_t;
@@ -69,7 +70,7 @@ architecture behaviour of execute1 is
     constant reg_type_init : reg_type :=
         (e => Execute1ToWritebackInit, f => Execute1ToFetch1Init,
          busy => '0', lr_update => '0', terminate => '0',
-         mul_in_progress => '0', div_in_progress => '0', cntz_in_progress => '0',
+         mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', cntz_in_progress => '0',
          slow_op_insn => OP_ILLEGAL, slow_op_rc => '0', slow_op_oe => '0', slow_op_xerc => xerc_init,
          next_lr => (others => '0'), last_nia => (others => '0'), others => (others => '0'));
 
@@ -89,13 +90,18 @@ architecture behaviour of execute1 is
     signal countzero_result: std_ulogic_vector(63 downto 0);
 
     -- multiply signals
-    signal x_to_multiply: Execute1ToMultiplyType;
-    signal multiply_to_x: MultiplyToExecute1Type;
+    signal x_to_multiply: MultiplyInputType;
+    signal multiply_to_x: MultiplyOutputType;
 
     -- divider signals
     signal x_to_divider: Execute1ToDividerType;
     signal divider_to_x: DividerToExecute1Type;
 
+    -- random number generator signals
+    signal random_raw  : std_ulogic_vector(63 downto 0);
+    signal random_cond : std_ulogic_vector(63 downto 0);
+    signal random_err  : std_ulogic;
+
     -- signals for logging
     signal exception_log : std_ulogic;
     signal irq_valid_log : std_ulogic;
@@ -158,6 +164,8 @@ architecture behaviour of execute1 is
 	    return '0';
 	when CA =>
 	    return xerc.ca;
+        when OV =>
+            return xerc.ov;
 	when ONE =>
 	    return '1';
 	end case;
@@ -184,6 +192,11 @@ architecture behaviour of execute1 is
 	return msr_out;
     end;
 
+    -- Tell vivado to keep the hierarchy for the random module so that the
+    -- net names in the xdc file match.
+    attribute keep_hierarchy : string;
+    attribute keep_hierarchy of random_0 : label is "yes";
+
 begin
 
     rotator_0: entity work.rotator
@@ -237,6 +250,14 @@ begin
             d_out => divider_to_x
             );
 
+    random_0: entity work.random
+        port map (
+            clk => clk,
+            data => random_cond,
+            raw => random_raw,
+            err => random_err
+            );
+
     dbg_msr_out <= ctrl.msr;
     log_rd_addr <= r.log_addr_spr;
 
@@ -274,7 +295,7 @@ begin
 	variable a_inv : std_ulogic_vector(63 downto 0);
 	variable result : std_ulogic_vector(63 downto 0);
 	variable newcrf : std_ulogic_vector(3 downto 0);
-	variable result_with_carry : std_ulogic_vector(64 downto 0);
+	variable sum_with_carry : std_ulogic_vector(64 downto 0);
 	variable result_en : std_ulogic;
 	variable crnum : crnum_t;
 	variable crbit : integer range 0 to 31;
@@ -308,9 +329,10 @@ begin
         variable taken_branch : std_ulogic;
         variable abs_branch : std_ulogic;
         variable spr_val : std_ulogic_vector(63 downto 0);
+        variable addend : std_ulogic_vector(127 downto 0);
     begin
 	result := (others => '0');
-	result_with_carry := (others => '0');
+	sum_with_carry := (others => '0');
 	result_en := '0';
 	newcrf := (others => '0');
         is_branch := '0';
@@ -371,6 +393,16 @@ begin
 	v.mul_in_progress := '0';
         v.div_in_progress := '0';
         v.cntz_in_progress := '0';
+        v.mul_finish := '0';
+
+        -- Main adder
+        if e_in.invert_a = '0' then
+            a_inv := a_in;
+        else
+            a_inv := not a_in;
+        end if;
+        sum_with_carry := ppc_adde(a_inv, b_in,
+                                   decode_input_carry(e_in.input_carry, v.e.xerc));
 
         -- signals to multiply and divide units
         sign1 := '0';
@@ -396,7 +428,7 @@ begin
             abs2 := - signed(b_in);
         end if;
 
-	x_to_multiply <= Execute1ToMultiplyInit;
+	x_to_multiply <= MultiplyInputInit;
 	x_to_multiply.is_32bit <= e_in.is_32bit;
 
         x_to_divider <= Execute1ToDividerInit;
@@ -406,7 +438,20 @@ begin
             x_to_divider.is_modulus <= '1';
         end if;
 
-        x_to_multiply.neg_result <= sign1 xor sign2;
+        addend := (others => '0');  -- default: nothing is added to the product
+        if e_in.insn(26) = '0' then  -- NOTE(review): presumably the low bit of the major opcode (0 for op 4, 1 for op 31) - confirm
+            -- integer multiply-add, major op 4 (if it is a multiply)
+            addend(63 downto 0) := c_in;
+            if e_in.is_signed = '1' then
+                addend(127 downto 64) := (others => c_in(63));  -- sign-extend c_in to 128 bits for the signed forms
+            end if;
+        end if;
+        if (sign1 xor sign2) = '1' then
+            addend := not addend;  -- NOTE(review): presumably pairs with not_result so that not(p + not c) = c - p; confirm against the multiplier
+        end if;
+
+        x_to_multiply.not_result <= sign1 xor sign2;  -- set when exactly one of sign1/sign2 is '1'
+        x_to_multiply.addend <= addend;
         x_to_divider.neg_result <= sign1 xor (sign2 and not x_to_divider.is_modulus);
         if e_in.is_32bit = '0' then
             -- 64-bit forms
@@ -548,24 +593,23 @@ begin
 	    when OP_NOP =>
 		-- Do nothing
 	    when OP_ADD | OP_CMP | OP_TRAP =>
-		if e_in.invert_a = '0' then
-		    a_inv := a_in;
-		else
-		    a_inv := not a_in;
-		end if;
-		result_with_carry := ppc_adde(a_inv, b_in,
-					      decode_input_carry(e_in.input_carry, v.e.xerc));
-		result := result_with_carry(63 downto 0);
+		result := sum_with_carry(63 downto 0);
                 carry_32 := result(32) xor a_inv(32) xor b_in(32);
-                carry_64 := result_with_carry(64);
+                carry_64 := sum_with_carry(64);
                 if e_in.insn_type = OP_ADD then
                     if e_in.output_carry = '1' then
-                        set_carry(v.e, carry_32, carry_64);
+                        if e_in.input_carry /= OV then
+                            set_carry(v.e, carry_32, carry_64);
+                        else
+                            v.e.xerc.ov := carry_64;
+                            v.e.xerc.ov32 := carry_32;
+                            v.e.write_xerc_enable := '1';
+                        end if;
                     end if;
                     if e_in.oe = '1' then
                         set_ov(v.e,
-                               calc_ov(a_inv(63), b_in(63), carry_64, result_with_carry(63)),
-                               calc_ov(a_inv(31), b_in(31), carry_32, result_with_carry(31)));
+                               calc_ov(a_inv(63), b_in(63), carry_64, sum_with_carry(63)),
+                               calc_ov(a_inv(31), b_in(31), carry_32, sum_with_carry(31)));
                     end if;
                     result_en := '1';
                 else
@@ -630,7 +674,37 @@ begin
                         end if;
                     end if;
                 end if;
-	    when OP_AND | OP_OR | OP_XOR | OP_POPCNT | OP_PRTY | OP_CMPB | OP_EXTS =>
+            when OP_ADDG6S =>
+                result := (others => '0');
+                for i in 0 to 14 loop  -- nibbles 0..14; the top nibble is handled after the loop
+                    lo := i * 4;        -- lowest bit of nibble i
+                    hi := (i + 1) * 4;  -- lowest bit of nibble i+1
+                    if (a_in(hi) xor b_in(hi) xor sum_with_carry(hi)) = '0' then  -- a xor b xor sum at bit hi is the carry into bit hi, i.e. the carry out of nibble i
+                        result(lo + 3 downto lo) := "0110";  -- no carry out of nibble i: put a 6 in that nibble
+                    end if;
+                end loop;
+                if sum_with_carry(64) = '0' then  -- bit 64 of the adder output is the carry out of the top nibble
+                    result(63 downto 60) := "0110";
+                end if;
+                result_en := '1';
+            when OP_CMPRB =>
+                newcrf := ppc_cmprb(a_in, b_in, insn_l(e_in.insn));
+                bf := insn_bf(e_in.insn);
+                crnum := to_integer(unsigned(bf));
+                v.e.write_cr_enable := '1';
+                v.e.write_cr_mask := num_to_fxm(crnum);
+                v.e.write_cr_data := newcrf & newcrf & newcrf & newcrf &
+                                     newcrf & newcrf & newcrf & newcrf;
+            when OP_CMPEQB =>
+                newcrf := ppc_cmpeqb(a_in, b_in);
+                bf := insn_bf(e_in.insn);
+                crnum := to_integer(unsigned(bf));
+                v.e.write_cr_enable := '1';
+                v.e.write_cr_mask := num_to_fxm(crnum);
+                v.e.write_cr_data := newcrf & newcrf & newcrf & newcrf &
+                                     newcrf & newcrf & newcrf & newcrf;
+            when OP_AND | OP_OR | OP_XOR | OP_POPCNT | OP_PRTY | OP_CMPB | OP_EXTS |
+                    OP_BPERM | OP_BCD =>
 		result := logical_result;
 		result_en := '1';
 	    when OP_B =>
@@ -736,6 +810,28 @@ begin
 			end if;
 		    end loop;
 		end if;
+            when OP_MCRXRX =>
+                newcrf := v.e.xerc.ov & v.e.xerc.ov32 & v.e.xerc.ca & v.e.xerc.ca32;  -- ISA order, MSB first: OV, OV32, CA, CA32
+                bf := insn_bf(e_in.insn);           -- destination CR field number
+                crnum := to_integer(unsigned(bf));
+                v.e.write_cr_enable := '1';
+                v.e.write_cr_mask := num_to_fxm(crnum);  -- mask derived from the destination field number
+                v.e.write_cr_data := newcrf & newcrf & newcrf & newcrf &
+                                     newcrf & newcrf & newcrf & newcrf;  -- same nibble replicated into all 8 fields
+            when OP_DARN =>
+                if random_err = '0' then  -- generator reports no error
+                    case e_in.insn(17 downto 16) is  -- these two instruction bits select the result format
+                        when "00" =>
+                            result := x"00000000" & random_cond(31 downto 0);  -- low 32 bits of random_cond, zero-extended
+                        when "10" =>
+                            result := random_raw;  -- full 64 bits of random_raw
+                        when others =>
+                            result := random_cond;  -- "01" and "11": full 64 bits of random_cond
+                    end case;
+                else
+                    result := (others => '1');  -- error: return all ones
+                end if;
+                result_en := '1';
 	    when OP_MFMSR =>
 		result := ctrl.msr;
 		result_en := '1';
@@ -864,6 +960,15 @@ begin
 		    set_carry(v.e, rotator_carry, rotator_carry);
 		end if;
 		result_en := '1';
+            when OP_SETB =>  -- setb: all ones / 1 / 0 from the first two bits of CR field BFA
+                crbit := to_integer(unsigned(insn_bfa(e_in.insn))) * 4;  -- offset of field BFA within cr_in
+                result := (others => '0');
+                if cr_in(31 - crbit) = '1' then  -- first bit of the field (LT per the ISA)
+                    result := (others => '1');
+                elsif cr_in(30 - crbit) = '1' then  -- second bit of the field (GT per the ISA)
+                    result(0) := '1';
+                end if;
+                result_en := '1';  -- was missing: without it the computed value is never written back
 
 	    when OP_ISYNC =>
 		v.f.redirect := '1';
@@ -946,9 +1051,9 @@ begin
             -- cnt[lt]z always takes two cycles
             result := countzero_result;
             result_en := '1';
-            v.e.write_reg := gpr_to_gspr(v.slow_op_dest);
-            v.e.rc := v.slow_op_rc;
-            v.e.xerc := v.slow_op_xerc;
+            v.e.write_reg := gpr_to_gspr(r.slow_op_dest);
+            v.e.rc := r.slow_op_rc;
+            v.e.xerc := r.slow_op_xerc;
             v.e.valid := '1';
 	elsif r.mul_in_progress = '1' or r.div_in_progress = '1' then
 	    if (r.mul_in_progress = '1' and multiply_to_x.valid = '1') or
@@ -964,31 +1069,47 @@ begin
                         when others =>
                             -- i.e. OP_MUL_L64
                             result := multiply_to_x.result(63 downto 0);
-                            overflow := multiply_to_x.overflow;
                     end case;
 		else
 		    result := divider_to_x.write_reg_data;
 		    overflow := divider_to_x.overflow;
 		end if;
-		result_en := '1';
-		v.e.write_reg := gpr_to_gspr(v.slow_op_dest);
-		v.e.rc := v.slow_op_rc;
-		v.e.xerc := v.slow_op_xerc;
-		v.e.write_xerc_enable := v.slow_op_oe;
-		-- We must test oe because the RC update code in writeback
-		-- will use the xerc value to set CR0:SO so we must not clobber
-		-- xerc if OE wasn't set.
-		if v.slow_op_oe = '1' then
-		    v.e.xerc.ov := overflow;
-		    v.e.xerc.ov32 := overflow;
-		    v.e.xerc.so := v.slow_op_xerc.so or overflow;
-		end if;
-		v.e.valid := '1';
+                if r.mul_in_progress = '1' and r.slow_op_oe = '1' then
+                    -- have to wait until next cycle for overflow indication
+                    v.mul_finish := '1';
+                    v.busy := '1';
+                else
+                    result_en := '1';
+                    v.e.write_reg := gpr_to_gspr(r.slow_op_dest);
+                    v.e.rc := r.slow_op_rc;
+                    v.e.xerc := r.slow_op_xerc;
+                    v.e.write_xerc_enable := r.slow_op_oe;
+                    -- We must test oe because the RC update code in writeback
+                    -- will use the xerc value to set CR0:SO so we must not clobber
+                    -- xerc if OE wasn't set.
+                    if r.slow_op_oe = '1' then
+                        v.e.xerc.ov := overflow;
+                        v.e.xerc.ov32 := overflow;
+                        v.e.xerc.so := r.slow_op_xerc.so or overflow;
+                    end if;
+                    v.e.valid := '1';
+                end if;
 	    else
 		v.busy := '1';
 		v.mul_in_progress := r.mul_in_progress;
 		v.div_in_progress := r.div_in_progress;
 	    end if;
+        elsif r.mul_finish = '1' then
+            result := r.e.write_data;
+            result_en := '1';
+            v.e.write_reg := gpr_to_gspr(r.slow_op_dest);
+            v.e.rc := r.slow_op_rc;
+            v.e.xerc := r.slow_op_xerc;
+            v.e.write_xerc_enable := r.slow_op_oe;
+            v.e.xerc.ov := multiply_to_x.overflow;
+            v.e.xerc.ov32 := multiply_to_x.overflow;
+            v.e.xerc.so := r.slow_op_xerc.so or multiply_to_x.overflow;
+            v.e.valid := '1';
 	end if;
 
         if illegal = '1' then
diff --git a/fpga/fpga-random.vhdl b/fpga/fpga-random.vhdl
new file mode 100644
index 0000000..7897c05
--- /dev/null
+++ b/fpga/fpga-random.vhdl
@@ -0,0 +1,53 @@
+-- Random number generator for Microwatt
+-- Based on https://pdfs.semanticscholar.org/83ac/9e9c1bb3dad5180654984604c8d5d8137412.pdf
+-- "High Speed True Random Number Generators in Xilinx FPGAs"
+-- by Catalin Baetoniu, Xilinx Inc.
+
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+library work;
+
+entity random is
+    port (
+        clk  : in std_ulogic;
+        data : out std_ulogic_vector(63 downto 0);  -- driven from the lhca register
+        raw  : out std_ulogic_vector(63 downto 0);  -- ro_reg delayed by one further clock
+        err  : out std_ulogic                       -- constant '0' in this implementation
+        );
+end entity random;
+
+architecture behaviour of random is
+    signal ringosc : std_ulogic_vector(63 downto 0);  -- unclocked feedback network
+    signal ro_reg  : std_ulogic_vector(63 downto 0);  -- ringosc sampled on the rising edge of clk
+    signal lhca    : std_ulogic_vector(63 downto 0);  -- cellular-automaton state register
+
+    constant lhca_diag : std_ulogic_vector(63 downto 0) := x"fffffffffffffffb";  -- mask: a set bit lets that cell feed back into itself
+
+begin
+    random_osc : process(all)  -- purely combinatorial; each bit depends on itself and both neighbours
+    begin
+        -- chaotic set of ring oscillators
+        ringosc(0) <= ringosc(63) xor ringosc(0) xor ringosc(1);  -- bit 0 wraps around to bit 63
+        for i in 1 to 62 loop
+            ringosc(i) <= ringosc(i-1) xor ringosc(i) xor ringosc(i+1);
+        end loop;
+        ringosc(63) <= not (ringosc(62) xor ringosc(63) xor ringosc(0));  -- the only inverted cell; wraps to bit 0
+    end process;
+
+    lhca_update : process(clk)
+    begin
+        if rising_edge(clk) then
+            ro_reg <= ringosc;  -- sample the free-running network into the clk domain
+            raw <= ro_reg;
+            -- linear hybrid cellular automaton
+            -- used to even out the statistics of the ring oscillators
+            lhca <= ('0' & lhca(63 downto 1)) xor (lhca and lhca_diag) xor  -- right-shifted neighbour, masked self
+                    (lhca(62 downto 0) & '0') xor ro_reg;                   -- left-shifted neighbour, fresh sample
+        end if;
+    end process;
+
+    data <= lhca;
+    err <= '0';
+end behaviour;
diff --git a/fpga/fpga-random.xdc b/fpga/fpga-random.xdc
new file mode 100644
index 0000000..ba69f87
--- /dev/null
+++ b/fpga/fpga-random.xdc
@@ -0,0 +1,3 @@
+set_property ALLOW_COMBINATORIAL_LOOPS TRUE [get_nets soc0/processor/execute1_0/random_0/ro_reg*]
+set_property ALLOW_COMBINATORIAL_LOOPS TRUE [get_nets soc0/processor/execute1_0/random_0/p_*]
+set_property ALLOW_COMBINATORIAL_LOOPS TRUE [get_nets soc0/processor/execute1_0/random_0/D*]
diff --git a/insn_helpers.vhdl b/insn_helpers.vhdl
index acd2f72..592acb0 100644
--- a/insn_helpers.vhdl
+++ b/insn_helpers.vhdl
@@ -6,6 +6,7 @@ package insn_helpers is
     function insn_rt (insn_in : std_ulogic_vector) return std_ulogic_vector;
     function insn_ra (insn_in : std_ulogic_vector) return std_ulogic_vector;
     function insn_rb (insn_in : std_ulogic_vector) return std_ulogic_vector;
+    function insn_rcreg (insn_in : std_ulogic_vector) return std_ulogic_vector;
     function insn_si (insn_in : std_ulogic_vector) return std_ulogic_vector;
     function insn_ui (insn_in : std_ulogic_vector) return std_ulogic_vector;
     function insn_l (insn_in : std_ulogic_vector) return std_ulogic;
@@ -59,6 +60,11 @@ package body insn_helpers is
         return insn_in(15 downto 11);
     end;
 
+    function insn_rcreg (insn_in : std_ulogic_vector) return std_ulogic_vector is
+    begin  -- returns the 5-bit slice insn_in(10 downto 6); presumably the RC register operand -- confirm against callers
+        return insn_in(10 downto 6);
+    end;
+
     function insn_si (insn_in : std_ulogic_vector) return std_ulogic_vector is
     begin
         return insn_in(15 downto 0);
diff --git a/logical.vhdl b/logical.vhdl
index 0f53544..d008e47 100644
--- a/logical.vhdl
+++ b/logical.vhdl
@@ -35,11 +35,79 @@ architecture behaviour of logical is
     signal par0, par1 : std_ulogic;
     signal popcnt   : std_ulogic_vector(63 downto 0);
     signal parity   : std_ulogic_vector(63 downto 0);
+    signal permute  : std_ulogic_vector(7 downto 0);
+
+    function bcd_to_dpd(bcd: std_ulogic_vector(11 downto 0)) return std_ulogic_vector is
+        variable dpd: std_ulogic_vector(9 downto 0);  -- 10-bit value returned to the caller
+        variable a, b, c, d, e, f, g, h, i, j, k, m: std_ulogic;  -- one name per input bit, MSB first
+    begin
+        -- The following equations are copied from PowerISA v3.0B Book 1 appendix B
+        a := bcd(11);
+        b := bcd(10);
+        c := bcd(9);
+        d := bcd(8);
+        e := bcd(7);
+        f := bcd(6);
+        g := bcd(5);
+        h := bcd(4);
+        i := bcd(3);
+        j := bcd(2);
+        k := bcd(1);
+        m := bcd(0);
+        dpd(9) := (f and a and i and not e) or (j and a and not i) or (b and not a);
+        dpd(8) := (g and a and i and not e) or (k and a and not i) or (c and not a);
+        dpd(7) := d;  -- passed through unchanged
+        dpd(6) := (j and not a and e and not i) or (f and not i and not e) or
+                  (f and not a and not e) or (e and i);
+        dpd(5) := (k and not a and e and not i) or (g and not i and not e) or
+                  (g and not a and not e) or (a and i);
+        dpd(4) := h;  -- passed through unchanged
+        dpd(3) := a or e or i;
+        dpd(2) := (not e and j and not i) or (e and i) or a;
+        dpd(1) := (not a and k and not i) or (a and i) or e;
+        dpd(0) := m;  -- passed through unchanged
+        return dpd;
+    end;
+
+    function dpd_to_bcd(dpd: std_ulogic_vector(9 downto 0)) return std_ulogic_vector is
+        variable bcd: std_ulogic_vector(11 downto 0);  -- 12-bit value returned to the caller
+        variable p, q, r, s, t, u, v, w, x, y: std_ulogic;  -- one name per input bit, MSB first
+    begin
+        -- The following equations are copied from PowerISA v3.0B Book 1 appendix B
+        p := dpd(9);
+        q := dpd(8);
+        r := dpd(7);
+        s := dpd(6);
+        t := dpd(5);
+        u := dpd(4);
+        v := dpd(3);
+        w := dpd(2);
+        x := dpd(1);
+        y := dpd(0);
+        bcd(11) := (not s and v and w) or (t and v and w and s) or (v and w and not x);
+        bcd(10) := (p and s and x and not t) or (p and not w) or (p and not v);
+        bcd(9)  := (q and s and x and not t) or (q and not w) or (q and not v);
+        bcd(8)  := r;  -- passed through unchanged
+        bcd(7)  := (v and not w and x) or (s and v and w and x) or (not t and v and w and x);
+        bcd(6)  := (p and t and v and w and x and not s) or (s and not x and v) or
+                   (s and not v);
+        bcd(5)  := (q and t and w and v and x and not s) or (t and not x and v) or
+                   (t and not v);
+        bcd(4)  := u;  -- passed through unchanged
+        bcd(3)  := (t and v and w and x) or (s and v and w and x) or (v and not w and not x);
+        bcd(2)  := (p and not s and not t and w and v) or (s and v and not w and x) or
+                   (p and w and not x and v) or (w and not v);
+        bcd(1)  := (q and not s and not t and v and w) or (t and v and not w and x) or
+                   (q and v and w and not x) or (x and not v);
+        bcd(0)  := y;  -- passed through unchanged
+        return bcd;
+    end;
 
 begin
     logical_0: process(all)
         variable rb_adj, tmp : std_ulogic_vector(63 downto 0);
         variable negative : std_ulogic;
+        variable j : integer;
     begin
         -- population counts
         for i in 0 to 31 loop
@@ -81,6 +149,16 @@ begin
             parity(32) <= par1;
         end if;
 
+        -- bit permutation (bpermd): byte i of rs is an index that selects one bit of rb for permute(i)
+        for i in 0 to 7 loop
+            j := i * 8;
+            if rs(j+7 downto j+6) = "00" then  -- top two bits clear: index is below 64
+                permute(i) <= rb(to_integer(unsigned(not rs(j+5 downto j))));  -- index is MSB-0 numbered; 'not' gives 63 - index for the downto-indexed rb
+            else
+                permute(i) <= '0';  -- out-of-range index yields 0
+            end if;
+        end loop;
+
         rb_adj := rb;
         if invert_in = '1' then
             rb_adj := not rb;
@@ -106,6 +184,19 @@ begin
                 tmp := parity;
             when OP_CMPB =>
                 tmp := ppc_cmpb(rs, rb);
+            when OP_BPERM =>
+                tmp := std_ulogic_vector(resize(unsigned(permute), 64));
+            when OP_BCD =>
+                -- invert_in is abused to indicate direction of conversion
+                if invert_in = '0' then
+                    -- cbcdtd
+                    tmp := x"000" & bcd_to_dpd(rs(55 downto 44)) & bcd_to_dpd(rs(43 downto 32)) &
+                           x"000" & bcd_to_dpd(rs(23 downto 12)) & bcd_to_dpd(rs(11 downto 0));
+                else
+                    -- cdtbcd
+                    tmp := x"00" & dpd_to_bcd(rs(51 downto 42)) & dpd_to_bcd(rs(41 downto 32)) &
+                           x"00" & dpd_to_bcd(rs(19 downto 10)) & dpd_to_bcd(rs(9 downto 0));
+                end if;
             when others =>
                 -- EXTS
                 -- note datalen is a 1-hot encoding
diff --git a/microwatt.core b/microwatt.core
index a2d37df..cd24a06 100644
--- a/microwatt.core
+++ b/microwatt.core
@@ -64,6 +64,8 @@ filesets:
   xilinx_specific:
     files:
       - xilinx-mult.vhdl : {file_type : vhdlSource-2008}
+      - fpga/fpga-random.vhdl : {file_type : vhdlSource-2008}
+      - fpga/fpga-random.xdc : {file_type : xdc}
 
   debug_xilinx:
     files:
diff --git a/multiply.vhdl b/multiply.vhdl
index 7a4c81b..a7ca7ac 100644
--- a/multiply.vhdl
+++ b/multiply.vhdl
@@ -12,22 +12,22 @@ entity multiply is
     port (
         clk   : in std_logic;
 
-        m_in  : in Execute1ToMultiplyType;
-        m_out : out MultiplyToExecute1Type
+        m_in  : in MultiplyInputType;
+        m_out : out MultiplyOutputType
         );
 end entity multiply;
 
 architecture behaviour of multiply is
-    signal m: Execute1ToMultiplyType := Execute1ToMultiplyInit;
+    signal m: MultiplyInputType := MultiplyInputInit;
 
     type multiply_pipeline_stage is record
         valid     : std_ulogic;
         data      : unsigned(127 downto 0);
 	is_32bit  : std_ulogic;
-        neg_res   : std_ulogic;
+        not_res   : std_ulogic;
     end record;
     constant MultiplyPipelineStageInit : multiply_pipeline_stage := (valid => '0',
-								     is_32bit => '0', neg_res => '0',
+								     is_32bit => '0', not_res => '0',
 								     data => (others => '0'));
 
     type multiply_pipeline_type is array(0 to PIPELINE_DEPTH-1) of multiply_pipeline_stage;
@@ -38,12 +38,15 @@ architecture behaviour of multiply is
     end record;
 
     signal r, rin : reg_type := (multiply_pipeline => MultiplyPipelineInit);
+    signal overflow : std_ulogic;
+    signal ovf_in   : std_ulogic;
 begin
     multiply_0: process(clk)
     begin
         if rising_edge(clk) then
             m <= m_in;
             r <= rin;
+            overflow <= ovf_in;
         end if;
     end process;
 
@@ -53,19 +56,19 @@ begin
         variable d2 : std_ulogic_vector(63 downto 0);
 	variable ov : std_ulogic;
     begin
+        v := r;
         v.multiply_pipeline(0).valid := m.valid;
-        v.multiply_pipeline(0).data := unsigned(m.data1) * unsigned(m.data2);
+        v.multiply_pipeline(0).data := (unsigned(m.data1) * unsigned(m.data2)) + unsigned(m.addend);
         v.multiply_pipeline(0).is_32bit := m.is_32bit;
-        v.multiply_pipeline(0).neg_res := m.neg_result;
+        v.multiply_pipeline(0).not_res := m.not_result;
 
         loop_0: for i in 1 to PIPELINE_DEPTH-1 loop
             v.multiply_pipeline(i) := r.multiply_pipeline(i-1);
         end loop;
 
-        if v.multiply_pipeline(PIPELINE_DEPTH-1).neg_res = '0' then
-            d := std_ulogic_vector(v.multiply_pipeline(PIPELINE_DEPTH-1).data);
-        else
-            d := std_ulogic_vector(- signed(v.multiply_pipeline(PIPELINE_DEPTH-1).data));
+        d := std_ulogic_vector(v.multiply_pipeline(PIPELINE_DEPTH-1).data);
+        if v.multiply_pipeline(PIPELINE_DEPTH-1).not_res = '1' then
+            d := not d;
         end if;
 
         ov := '0';
@@ -74,9 +77,10 @@ begin
         else
             ov := (or d(127 downto 63)) and not (and d(127 downto 63));
         end if;
+        ovf_in <= ov;
 
         m_out.result <= d;
-        m_out.overflow <= ov;
+        m_out.overflow <= overflow;
         m_out.valid <= v.multiply_pipeline(PIPELINE_DEPTH-1).valid;
 
         rin <= v;
diff --git a/multiply_tb.vhdl b/multiply_tb.vhdl
index 87f029d..884b828 100644
--- a/multiply_tb.vhdl
+++ b/multiply_tb.vhdl
@@ -17,8 +17,8 @@ architecture behave of multiply_tb is
 
     constant pipeline_depth : integer := 4;
 
-    signal m1               : Execute1ToMultiplyType := Execute1ToMultiplyInit;
-    signal m2               : MultiplyToExecute1Type;
+    signal m1               : MultiplyInputType := MultiplyInputInit;
+    signal m2               : MultiplyOutputType;
 
     function absval(x: std_ulogic_vector) return std_ulogic_vector is
     begin
@@ -45,6 +45,7 @@ begin
     stim_process: process
         variable ra, rb, rt, behave_rt: std_ulogic_vector(63 downto 0);
         variable si: std_ulogic_vector(15 downto 0);
+        variable sign: std_ulogic;
     begin
         wait for clk_period;
 
@@ -90,7 +91,9 @@ begin
 
             m1.data1 <= absval(ra);
             m1.data2 <= absval(rb);
-            m1.neg_result <= ra(63) xor rb(63);
+            sign := ra(63) xor rb(63);
+            m1.not_result <= sign;
+            m1.addend <= (others => sign);
             m1.valid <= '1';
 
             wait for clk_period;
@@ -114,7 +117,8 @@ begin
 
             m1.data1 <= ra;
             m1.data2 <= rb;
-            m1.neg_result <= '0';
+            m1.not_result <= '0';
+            m1.addend <= (others => '0');
             m1.valid <= '1';
 
             wait for clk_period;
@@ -138,7 +142,9 @@ begin
 
             m1.data1 <= absval(ra);
             m1.data2 <= absval(rb);
-            m1.neg_result <= ra(63) xor rb(63);
+            sign := ra(63) xor rb(63);
+            m1.not_result <= sign;
+            m1.addend <= (others => sign);
             m1.valid <= '1';
 
             wait for clk_period;
@@ -164,7 +170,9 @@ begin
             m1.data1(31 downto 0) <= absval(ra(31 downto 0));
             m1.data2 <= (others => '0');
             m1.data2(31 downto 0) <= absval(rb(31 downto 0));
-            m1.neg_result <= ra(31) xor rb(31);
+            sign := ra(31) xor rb(31);
+            m1.not_result <= sign;
+            m1.addend <= (others => sign);
             m1.valid <= '1';
 
             wait for clk_period;
@@ -190,7 +198,9 @@ begin
             m1.data1(31 downto 0) <= absval(ra(31 downto 0));
             m1.data2 <= (others => '0');
             m1.data2(31 downto 0) <= absval(rb(31 downto 0));
-            m1.neg_result <= ra(31) xor rb(31);
+            sign := ra(31) xor rb(31);
+            m1.not_result <= sign;
+            m1.addend <= (others => sign);
             m1.valid <= '1';
 
             wait for clk_period;
@@ -217,7 +227,8 @@ begin
             m1.data1(31 downto 0) <= ra(31 downto 0);
             m1.data2 <= (others => '0');
             m1.data2(31 downto 0) <= rb(31 downto 0);
-            m1.neg_result <= '0';
+            m1.not_result <= '0';
+            m1.addend <= (others => '0');
             m1.valid <= '1';
 
             wait for clk_period;
@@ -243,7 +254,9 @@ begin
             m1.data1 <= absval(ra);
             m1.data2 <= (others => '0');
             m1.data2(15 downto 0) <= absval(si);
-            m1.neg_result <= ra(63) xor si(15);
+            sign := ra(63) xor si(15);
+            m1.not_result <= sign;
+            m1.addend <= (others => sign);
             m1.valid <= '1';
 
             wait for clk_period;
diff --git a/nonrandom.vhdl b/nonrandom.vhdl
new file mode 100644
index 0000000..16f81da
--- /dev/null
+++ b/nonrandom.vhdl
@@ -0,0 +1,22 @@
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+library work;
+
+entity random is  -- stub with the same port list as the other 'random' entities in this patch
+    port (
+        clk  : in std_ulogic;                       -- unused by this architecture
+        data : out std_ulogic_vector(63 downto 0);  -- constant all ones
+        raw  : out std_ulogic_vector(63 downto 0);  -- constant all ones
+        err  : out std_ulogic                       -- constant '1'
+        );
+end entity random;
+
+architecture behaviour of random is
+
+begin
+    data <= (others => '1');
+    raw <= (others => '1');
+    err <= '1';  -- always asserted: this variant never supplies random data
+end behaviour;
diff --git a/ppc_fx_insns.vhdl b/ppc_fx_insns.vhdl
index 5fdf1c7..c34a884 100644
--- a/ppc_fx_insns.vhdl
+++ b/ppc_fx_insns.vhdl
@@ -87,6 +87,8 @@ package ppc_fx_insns is
                            so: std_ulogic) return std_ulogic_vector;
 
 	function ppc_cmpb (rs, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector;
+        function ppc_cmpeqb (ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector;
+        function ppc_cmprb (ra, rb: std_ulogic_vector(63 downto 0); l: std_ulogic) return std_ulogic_vector;
 
 	function ppc_divw (ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector;
 	function ppc_divdu (ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector;
@@ -746,6 +748,34 @@ package body ppc_fx_insns is
 		return ret;
 	end;
 
+        function ppc_cmpeqb (ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is
+            variable match: std_ulogic;  -- '1' once any byte of rb equals the low byte of ra
+            variable j: integer;         -- bit offset of the rb byte under test
+        begin
+            match := '0';
+            for i in 0 to 7 loop  -- compare against each of the 8 bytes of rb
+                j := i * 8;
+                if ra(7 downto 0) = rb(j + 7 downto j) then
+                    match := '1';
+                end if;
+            end loop;
+            return '0' & match & "00";  -- 4-bit result with match in the second position
+        end;
+
+        function ppc_cmprb (ra, rb: std_ulogic_vector(63 downto 0); l: std_ulogic) return std_ulogic_vector is
+            variable match: std_ulogic;       -- '1' when v lies inside a tested range
+            variable v: unsigned(7 downto 0); -- low byte of ra, compared as unsigned
+        begin
+            match := '0';
+            v := unsigned(ra(7 downto 0));
+            if v >= unsigned(rb(7 downto 0)) and v <= unsigned(rb(15 downto 8)) then  -- range 1: always tested, bounds inclusive
+                match := '1';
+            elsif l = '1' and v >= unsigned(rb(23 downto 16)) and v <= unsigned(rb(31 downto 24)) then  -- range 2: only when l = '1'
+                match := '1';
+            end if;
+            return '0' & match & "00";  -- 4-bit result with match in the second position
+        end;
+
 	-- Not synthesizable
 	function ppc_divw (ra, rb: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is
 		variable tmp: signed(31 downto 0);
diff --git a/random.vhdl b/random.vhdl
new file mode 100644
index 0000000..063c30e
--- /dev/null
+++ b/random.vhdl
@@ -0,0 +1,30 @@
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+library work;
+use work.glibc_random.all;
+
+entity random is  -- variant built on the glibc_random package
+    port (
+        clk  : in std_ulogic;
+        data : out std_ulogic_vector(63 downto 0);  -- new value on every rising edge of clk
+        raw  : out std_ulogic_vector(63 downto 0);  -- same value as data in this variant
+        err  : out std_ulogic                       -- constant '0'
+        );
+end entity random;
+
+architecture behaviour of random is
+begin
+    err <= '0';
+
+    process(clk)
+        variable rand : std_ulogic_vector(63 downto 0);  -- holds one draw so data and raw get the same value
+    begin
+        if rising_edge(clk) then
+            rand := pseudorand(64);  -- from work.glibc_random; presumably returns 64 pseudo-random bits -- confirm
+            data <= rand;
+            raw <= rand;
+        end if;
+    end process;
+end behaviour;
diff --git a/scripts/fmt_log/fmt_log.c b/scripts/fmt_log/fmt_log.c
index c8fb501..146346d 100644
--- a/scripts/fmt_log/fmt_log.c
+++ b/scripts/fmt_log/fmt_log.c
@@ -90,11 +90,11 @@ const char *ops[64] =
 	"illegal", "nop    ", "add    ", "and    ", "attn   ", "b      ", "bc     ", "bcreg  ",
 	"bperm  ", "cmp    ", "cmpb   ", "cmpeqb ", "cmprb  ", "cntz   ", "crop   ", "darn   ",
 	"dcbf   ", "dcbst  ", "dcbt   ", "dcbtst ", "dcbz   ", "div    ", "dive   ", "exts   ",
-	"extswsl", "icbi   ", "icbt   ", "isel   ", "isync  ", "ld     ", "st     ", "maddhd ",
-	"maddhdu", "maddld ", "mcrxr  ", "mcrxrx ", "mfcr   ", "mfmsr  ", "mfspr  ", "mod    ",
-	"mtcrf  ", "mtmsr  ", "mtspr  ", "mull64 ", "mulh64 ", "mulh32 ", "or     ", "popcnt ",
-	"prty   ", "rfid   ", "rlc    ", "rlcl   ", "rlcr   ", "sc     ", "setb   ", "shl    ",
-	"shr    ", "sync   ", "tlbie  ", "trap   ", "xor    ", "ffail  ", "?62    ", "?63    "
+	"extswsl", "icbi   ", "icbt   ", "isel   ", "isync  ", "ld     ", "st     ", "mcrxrx ",
+	"mfcr   ", "mfmsr  ", "mfspr  ", "mod    ", "mtcrf  ", "mtmsr  ", "mtspr  ", "mull64 ",
+	"mulh64 ", "mulh32 ", "or     ", "popcnt ", "prty   ", "rfid   ", "rlc    ", "rlcl   ",
+	"rlcr   ", "sc     ", "setb   ", "shl    ", "shr    ", "sync   ", "tlbie  ", "trap   ",
+	"xor    ", "bcd    ", "addg6s ", "ffail  ", "?60    ", "?61    ", "?62    ", "?63    "
 };
 
 const char *spr_names[13] =
diff --git a/xilinx-mult.vhdl b/xilinx-mult.vhdl
index 46366d6..22d73c7 100644
--- a/xilinx-mult.vhdl
+++ b/xilinx-mult.vhdl
@@ -12,8 +12,8 @@ entity multiply is
     port (
         clk   : in std_logic;
 
-        m_in  : in Execute1ToMultiplyType;
-        m_out : out MultiplyToExecute1Type
+        m_in  : in MultiplyInputType;
+        m_out : out MultiplyOutputType
         );
 end entity multiply;
 
@@ -33,11 +33,12 @@ architecture behaviour of multiply is
     signal p1_pat, p1_patb : std_ulogic;
 
     signal req_32bit, r32_1 : std_ulogic;
-    signal req_neg, rneg_1 : std_ulogic;
+    signal req_not, rnot_1 : std_ulogic;
     signal valid_1 : std_ulogic;
+    signal overflow, ovf_in : std_ulogic;
 
 begin
-    addend <= (others => m_in.neg_result);
+    addend <= m_in.addend;
 
     m00: DSP48E1
         generic map (
@@ -73,7 +74,7 @@ begin
             CECTRL => '0',
             CED => '0',
             CEINMODE => '0',
-            CEM => '1',
+            CEM => m_in.valid,
             CEP => '0',
             CLK => clk,
             D => (others => '0'),
@@ -129,7 +130,7 @@ begin
             CECTRL => '0',
             CED => '0',
             CEINMODE => '0',
-            CEM => '1',
+            CEM => m_in.valid,
             CEP => '0',
             CLK => clk,
             D => (others => '0'),
@@ -184,7 +185,7 @@ begin
             CECTRL => '0',
             CED => '0',
             CEINMODE => '0',
-            CEM => '1',
+            CEM => m_in.valid,
             CEP => '0',
             CLK => clk,
             D => (others => '0'),
@@ -239,7 +240,7 @@ begin
             CECTRL => '0',
             CED => '0',
             CEINMODE => '0',
-            CEM => '1',
+            CEM => m_in.valid,
             CEP => '0',
             CLK => clk,
             D => (others => '0'),
@@ -295,7 +296,7 @@ begin
             CECTRL => '0',
             CED => '0',
             CEINMODE => '0',
-            CEM => '1',
+            CEM => m_in.valid,
             CEP => '0',
             CLK => clk,
             D => (others => '0'),
@@ -351,7 +352,7 @@ begin
             CECTRL => '0',
             CED => '0',
             CEINMODE => '0',
-            CEM => '1',
+            CEM => m_in.valid,
             CEP => '0',
             CLK => clk,
             D => (others => '0'),
@@ -408,7 +409,7 @@ begin
             CECTRL => '0',
             CED => '0',
             CEINMODE => '0',
-            CEM => '1',
+            CEM => m_in.valid,
             CEP => '0',
             CLK => clk,
             D => (others => '0'),
@@ -464,7 +465,7 @@ begin
             CECTRL => '0',
             CED => '0',
             CEINMODE => '0',
-            CEM => '1',
+            CEM => m_in.valid,
             CEP => '0',
             CLK => clk,
             D => (others => '0'),
@@ -520,7 +521,7 @@ begin
             CECTRL => '0',
             CED => '0',
             CEINMODE => '0',
-            CEM => '1',
+            CEM => m_in.valid,
             CEP => '0',
             CLK => clk,
             D => (others => '0'),
@@ -575,7 +576,7 @@ begin
             CECTRL => '0',
             CED => '0',
             CEINMODE => '0',
-            CEM => '1',
+            CEM => m_in.valid,
             CEP => '0',
             CLK => clk,
             D => (others => '0'),
@@ -630,7 +631,7 @@ begin
             CECTRL => '0',
             CED => '0',
             CEINMODE => '0',
-            CEM => '1',
+            CEM => m_in.valid,
             CEP => '0',
             CLK => clk,
             D => (others => '0'),
@@ -685,7 +686,7 @@ begin
             CECTRL => '0',
             CED => '0',
             CEINMODE => '0',
-            CEM => '1',
+            CEM => m_in.valid,
             CEP => '0',
             CLK => clk,
             D => (others => '0'),
@@ -734,12 +735,12 @@ begin
             CARRYINSEL => "000",
             CARRYOUT => s0_carry,
             CEA1 => '0',
-            CEA2 => '1',
+            CEA2 => valid_1,
             CEAD => '0',
             CEALUMODE => '0',
             CEB1 => '0',
-            CEB2 => '1',
-            CEC => '1',
+            CEB2 => valid_1,
+            CEC => valid_1,
             CECARRYIN => '0',
             CECTRL => '0',
             CED => '0',
@@ -792,12 +793,12 @@ begin
             CARRYIN => s0_carry(3),
             CARRYINSEL => "000",
             CEA1 => '0',
-            CEA2 => '1',
+            CEA2 => valid_1,
             CEAD => '0',
             CEALUMODE => '0',
             CEB1 => '0',
-            CEB2 => '1',
-            CEC => '1',
+            CEB2 => valid_1,
+            CEC => valid_1,
             CECARRYIN => '0',
             CECTRL => '0',
             CED => '0',
@@ -848,7 +849,7 @@ begin
         port map (
             A => m21_p(22 downto 0) & m03_p(5 downto 0) & '0',
             ACIN => (others => '0'),
-            ALUMODE => "00" & rneg_1 & '0',
+            ALUMODE => "00" & rnot_1 & '0',
             B => (others => '0'),
             BCIN => (others => '0'),
             C => p0_mask,
@@ -857,12 +858,12 @@ begin
             CARRYINSEL => "000",
             CARRYOUT => p0_carry,
             CEA1 => '0',
-            CEA2 => '1',
+            CEA2 => valid_1,
             CEAD => '0',
-            CEALUMODE => '1',
+            CEALUMODE => valid_1,
             CEB1 => '0',
-            CEB2 => '1',
-            CEC => '1',
+            CEB2 => valid_1,
+            CEC => valid_1,
             CECARRYIN => '0',
             CECTRL => '0',
             CED => '0',
@@ -911,7 +912,7 @@ begin
         port map (
             A => x"0000000" & '0' & m21_p(41),
             ACIN => (others => '0'),
-            ALUMODE => "00" & rneg_1 & '0',
+            ALUMODE => "00" & rnot_1 & '0',
             B => m21_p(40 downto 23),
             BCIN => (others => '0'),
             C => (others => '0'),
@@ -919,11 +920,11 @@ begin
             CARRYIN => p0_carry(3),
             CARRYINSEL => "000",
             CEA1 => '0',
-            CEA2 => '1',
+            CEA2 => valid_1,
             CEAD => '0',
-            CEALUMODE => '1',
+            CEALUMODE => valid_1,
             CEB1 => '0',
-            CEB2 => '1',
+            CEB2 => valid_1,
             CEC => '0',
             CECARRYIN => '0',
             CECTRL => '0',
@@ -952,7 +953,7 @@ begin
             RSTP => '0'
             );
 
-    product(31 downto 0) <= product_lo xor (31 downto 0 => req_neg);
+    product(31 downto 0) <= product_lo xor (31 downto 0 => req_not);
 
     mult_out: process(all)
         variable ov : std_ulogic;
@@ -964,9 +965,10 @@ begin
             ov := not ((p1_pat and p0_pat and not product(31)) or
                        (p1_patb and p0_patb and product(31)));
         end if;
+        ovf_in <= ov;
 
         m_out.result <= product;
-        m_out.overflow <= ov;
+        m_out.overflow <= overflow;
     end process;
 
     process(clk)
@@ -977,8 +979,9 @@ begin
             valid_1 <= m_in.valid;
             req_32bit <= r32_1;
             r32_1 <= m_in.is_32bit;
-            req_neg <= rneg_1;
-            rneg_1 <= m_in.neg_result;
+            req_not <= rnot_1;
+            rnot_1 <= m_in.not_result;
+            overflow <= ovf_in;
         end if;
     end process;