diff --git a/Makefile b/Makefile
index fb591a4..01eab73 100644
--- a/Makefile
+++ b/Makefile
@@ -74,7 +74,7 @@ core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \
 	cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \
 	logical.vhdl countbits.vhdl multiply.vhdl multiply-32s.vhdl divider.vhdl \
 	execute1.vhdl loadstore1.vhdl mmu.vhdl dcache.vhdl writeback.vhdl \
-	core_debug.vhdl core.vhdl fpu.vhdl pmu.vhdl
+	core_debug.vhdl core.vhdl fpu.vhdl pmu.vhdl bitsort.vhdl
 
 soc_files = wishbone_arbiter.vhdl wishbone_bram_wrapper.vhdl sync_fifo.vhdl \
 	wishbone_debug_master.vhdl xics.vhdl syscon.vhdl gpio.vhdl soc.vhdl \
diff --git a/bitsort.vhdl b/bitsort.vhdl
new file mode 100644
index 0000000..f2aeddb
--- /dev/null
+++ b/bitsort.vhdl
@@ -0,0 +1,102 @@
+-- Implements instructions that involve sorting bits,
+-- that is, cfuged, pextd and pdepd.
+--
+-- cfuged: Sort the bits in the mask in RB into 0s at the left, 1s at the right
+--         and move the bits in RS in the same fashion to give the result
+-- pextd:  Like cfuged but only uses the bits of RS where the
+--         corresponding bit in RB is 1
+-- pdepd:  Inverse of pextd; take the low-order bits of RS and spread them out
+--         to the bit positions which have a 1 in RB
+
+-- NB opc is bits 7-6 of the instruction:
+-- 00 = pdepd, 01 = pextd, 10 = cfuged
+
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+library work;
+use work.helpers.all;
+
+entity bit_sorter is  -- multi-cycle unit for cfuged, pextd and pdepd
+    port (
+        clk         : in std_ulogic;  -- all state updates on the rising edge
+        rst         : in std_ulogic;  -- synchronous reset: clears the busy flag, latched opcode and result
+        rs          : in std_ulogic_vector(63 downto 0);  -- data operand, captured when go = '1'
+        rb          : in std_ulogic_vector(63 downto 0);  -- mask operand, captured when go = '1'
+        go          : in std_ulogic;  -- starts a new operation (takes priority over one in progress)
+        opc         : in std_ulogic_vector(1 downto 0);  -- operation select, captured when go = '1': 00 = pdepd, 01 = pextd, 10 = cfuged
+        done        : out std_ulogic;  -- one-clock pulse once the last of the 64 steps has been taken
+        result      : out std_ulogic_vector(63 downto 0)  -- accumulated result; complete when done = '1'
+        );
+end entity bit_sorter;
+
+architecture behaviour of bit_sorter is  -- bit-serial implementation: one mask bit from each end per clock, 64 clocks per operation
+
+    signal val : std_ulogic_vector(63 downto 0);  -- result accumulator, zeroed on rst and on go
+    signal st  : std_ulogic;  -- '1' while an operation is in progress
+    signal sd  : std_ulogic;  -- done pulse; defaults to '0' every clock
+    signal opr : std_ulogic_vector(1 downto 0);  -- opc latched at go
+    signal bc  : unsigned(5 downto 0);  -- step counter 0..63; also the destination bit index for pdepd
+    signal jl  : unsigned(5 downto 0);  -- next destination index for mask-0 bits (cfuged); starts at 63, counts down
+    signal jr  : unsigned(5 downto 0);  -- next destination index for mask-1 bits; starts at 0, counts up
+    signal sr_ml : std_ulogic_vector(63 downto 0);  -- mask (rb), shifted left; bit 63 is examined
+    signal sr_mr : std_ulogic_vector(63 downto 0);  -- mask (rb), shifted right; bit 0 is examined
+    signal sr_vl : std_ulogic_vector(63 downto 0);  -- value (rs), shifted left in step with sr_ml
+    signal sr_vr : std_ulogic_vector(63 downto 0);  -- value (rs), shifted right; bit 0 is the next source bit
+
+begin
+    bsort_r: process(clk)
+    begin
+        if rising_edge(clk) then
+            sd <= '0';  -- default; overridden below on the final step so done lasts one clock
+            if rst = '1' then
+                st <= '0';
+                opr <= "00";
+                val <= (others => '0');
+            elsif go = '1' then
+                st <= '1';  -- capture operands and restart all counters
+                sr_ml <= rb;
+                sr_mr <= rb;
+                sr_vl <= rs;
+                sr_vr <= rs;
+                opr <= opc;
+                val <= (others => '0');  -- bits never written below stay 0 in the result
+                bc <= to_unsigned(0, 6);
+                jl <= to_unsigned(63, 6);
+                jr <= to_unsigned(0, 6);
+            elsif st = '1' then
+                if bc = 6x"3f" then  -- this is the 64th and last step
+                    st <= '0';
+                    sd <= '1';
+                end if;
+                bc <= bc + 1;
+                if sr_ml(63) = '0' and opr(1) = '1' then
+                    -- cfuged only: pack RS bits whose mask bit is 0 downwards from bit 63
+                    val(to_integer(jl)) <= sr_vl(63);
+                    jl <= jl - 1;
+                end if;
+                if sr_mr(0) = '1' then
+                    if opr = "00" then
+                        -- pdepd: deposit the next low-order RS bit at the position of this mask bit
+                        val(to_integer(bc)) <= sr_vr(0);
+                    else
+                        -- cfuged or pextd: pack RS bits whose mask bit is 1 upwards from bit 0
+                        val(to_integer(jr)) <= sr_vr(0);
+                    end if;
+                    jr <= jr + 1;
+                end if;
+                sr_vl <= sr_vl(62 downto 0) & '0';
+                if opr /= "00" or sr_mr(0) = '1' then  -- pdepd consumes an RS bit only when the mask bit is 1
+                    sr_vr <= '0' & sr_vr(63 downto 1);
+                end if;
+                sr_ml <= sr_ml(62 downto 0) & '0';
+                sr_mr <= '0' & sr_mr(63 downto 1);
+            end if;
+        end if;
+    end process;
+
+    done <= sd;  -- registered done pulse
+    result <= val;  -- registered result accumulator
+
+end behaviour;
diff --git a/decode1.vhdl b/decode1.vhdl
index 75bb9c3..86fb5cf 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -106,6 +106,7 @@ architecture behaviour of decode1 is
         INSN_brd         =>  (ALU,  NONE, OP_BREV,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
         INSN_cbcdtd      =>  (ALU,  NONE, OP_BCD,       NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
         INSN_cdtbcd      =>  (ALU,  NONE, OP_BCD,       NONE,       NONE,        RS,   RA,   '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
+        INSN_cfuged      =>  (ALU,  NONE, OP_BSORT,     NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
         INSN_cmp         =>  (ALU,  NONE, OP_CMP,       RA,         RB,          NONE, NONE, '0', '1', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE),
         INSN_cmpb        =>  (ALU,  NONE, OP_CMPB,      NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
         INSN_cmpeqb      =>  (ALU,  NONE, OP_CMPEQB,    RA,         RB,          NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
@@ -113,10 +114,10 @@ architecture behaviour of decode1 is
         INSN_cmpl        =>  (ALU,  NONE, OP_CMP,       RA,         RB,          NONE, NONE, '0', '1', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
         INSN_cmpli       =>  (ALU,  NONE, OP_CMP,       RA,         CONST_UI,    NONE, NONE, '0', '1', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
         INSN_cmprb       =>  (ALU,  NONE, OP_CMPRB,     RA,         RB,          NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
-        INSN_cntlzd      =>  (ALU,  NONE, OP_CNTZ,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE),
-        INSN_cntlzw      =>  (ALU,  NONE, OP_CNTZ,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE),
-        INSN_cnttzd      =>  (ALU,  NONE, OP_CNTZ,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE),
-        INSN_cnttzw      =>  (ALU,  NONE, OP_CNTZ,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE),
+        INSN_cntlzd      =>  (ALU,  NONE, OP_COUNTB,    NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE),
+        INSN_cntlzw      =>  (ALU,  NONE, OP_COUNTB,    NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE),
+        INSN_cnttzd      =>  (ALU,  NONE, OP_COUNTB,    NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0', NONE),
+        INSN_cnttzw      =>  (ALU,  NONE, OP_COUNTB,    NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0', NONE),
         INSN_crand       =>  (ALU,  NONE, OP_CROP,      NONE,       NONE,        NONE, NONE, '1', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
         INSN_crandc      =>  (ALU,  NONE, OP_CROP,      NONE,       NONE,        NONE, NONE, '1', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
         INSN_creqv       =>  (ALU,  NONE, OP_CROP,      NONE,       NONE,        NONE, NONE, '1', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
@@ -281,6 +282,8 @@ architecture behaviour of decode1 is
         INSN_ori         =>  (ALU,  NONE, OP_LOGIC,     NONE,       CONST_UI,    RS,   RA,   '0', '0', '1', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE),
         INSN_oris        =>  (ALU,  NONE, OP_LOGIC,     NONE,       CONST_UI_HI, RS,   RA,   '0', '0', '1', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE),
         INSN_paddi       =>  (ALU,  NONE, OP_ADD,       RA0_OR_CIA, CONST_PSI,   NONE, RT,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
+        INSN_pdepd       =>  (ALU,  NONE, OP_BSORT,     NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
+        INSN_pextd       =>  (ALU,  NONE, OP_BSORT,     NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
         INSN_plbz        =>  (LDST, NONE, OP_LOAD,      RA0_OR_CIA, CONST_PSI,   NONE, RT,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
         INSN_pld         =>  (LDST, NONE, OP_LOAD,      RA0_OR_CIA, CONST_PSI,   NONE, RT,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
         INSN_plfd        =>  (LDST, FPU,  OP_LOAD,      RA0_OR_CIA, CONST_PSI,   NONE, FRT,  '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
@@ -296,9 +299,9 @@ architecture behaviour of decode1 is
         INSN_pstfs       =>  (LDST, FPU,  OP_STORE,     RA0_OR_CIA, CONST_PSI,   FRS,  NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE),
         INSN_psth        =>  (LDST, NONE, OP_STORE,     RA0_OR_CIA, CONST_PSI,   RS,   NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
         INSN_pstw        =>  (LDST, NONE, OP_STORE,     RA0_OR_CIA, CONST_PSI,   RS,   NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
-        INSN_popcntb     =>  (ALU,  NONE, OP_POPCNT,    NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
-        INSN_popcntd     =>  (ALU,  NONE, OP_POPCNT,    NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
-        INSN_popcntw     =>  (ALU,  NONE, OP_POPCNT,    NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
+        INSN_popcntb     =>  (ALU,  NONE, OP_COUNTB,    NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
+        INSN_popcntd     =>  (ALU,  NONE, OP_COUNTB,    NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
+        INSN_popcntw     =>  (ALU,  NONE, OP_COUNTB,    NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
         INSN_prtyd       =>  (ALU,  NONE, OP_PRTY,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
         INSN_prtyw       =>  (ALU,  NONE, OP_PRTY,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
         INSN_rfid        =>  (ALU,  NONE, OP_RFID,      NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE),
diff --git a/decode2.vhdl b/decode2.vhdl
index 94fb6a7..a747495 100644
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -232,12 +232,13 @@ architecture behaviour of decode2 is
         );
 
     constant subresult_select : mux_select_array_t := (
-        OP_MUL_L64 => "000",            -- muldiv_result
-        OP_MUL_H64 => "001",
-        OP_MUL_H32 => "010",
-        OP_DIV     => "011",
-        OP_DIVE    => "011",
-        OP_MOD     => "011",
+        OP_MUL_L64 => "000",            -- multicyc_result
+        OP_MUL_H64 => "010",
+        OP_MUL_H32 => "001",
+        OP_DIV     => "101",
+        OP_DIVE    => "101",
+        OP_MOD     => "101",
+        OP_BSORT   => "100",
         OP_ADDG6S  => "001",            -- misc_result
         OP_ISEL    => "010",
         OP_DARN    => "011",
diff --git a/decode_types.vhdl b/decode_types.vhdl
index 4f81a36..dc104cd 100644
--- a/decode_types.vhdl
+++ b/decode_types.vhdl
@@ -6,7 +6,7 @@ package decode_types is
 			 OP_ATTN, OP_B, OP_BC, OP_BCREG,
 			 OP_BCD, OP_BPERM, OP_BREV,
                          OP_CMP, OP_CMPB, OP_CMPEQB, OP_CMPRB,
-			 OP_CNTZ, OP_CROP,
+			 OP_COUNTB, OP_CROP,
 			 OP_DARN, OP_DCBF, OP_DCBST, OP_XCBT, OP_DCBTST,
 			 OP_DCBZ, OP_ICBI,
                          OP_FP_CMP, OP_FP_ARITH, OP_FP_MOVE, OP_FP_MISC,
@@ -18,7 +18,8 @@ package decode_types is
 			 OP_MCRXRX, OP_MFCR, OP_MFMSR, OP_MFSPR,
 			 OP_MTCRF, OP_MTMSRD, OP_MTSPR, OP_MUL_L64,
 			 OP_MUL_H64, OP_MUL_H32,
-			 OP_POPCNT, OP_PRTY, OP_RFID,
+                         OP_BSORT,
+			 OP_PRTY, OP_RFID,
 			 OP_RLC, OP_RLCL, OP_RLCR, OP_SC, OP_SETB,
 			 OP_SHL, OP_SHR,
 			 OP_SYNC, OP_TLBIE, OP_TRAP,
@@ -179,11 +180,12 @@ package decode_types is
         INSN_and,
         INSN_andc,
         INSN_bperm,
+        INSN_cfuged,
         INSN_cmp,
         INSN_cmpb,
         INSN_cmpeqb,
-        INSN_cmpl,
-        INSN_cmprb, -- 140
+        INSN_cmpl, -- 140
+        INSN_cmprb,
         INSN_dcbf,
         INSN_dcbst,
         INSN_dcbt,
@@ -192,8 +194,8 @@ package decode_types is
         INSN_divd,
         INSN_divdu,
         INSN_divde,
-        INSN_divdeu,
-        INSN_divw, -- 150
+        INSN_divdeu, -- 150
+        INSN_divw,
         INSN_divwu,
         INSN_divwe,
         INSN_divweu,
@@ -202,8 +204,8 @@ package decode_types is
         INSN_icbt,
         INSN_isel,
         INSN_lbarx,
-        INSN_lbzcix,
-        INSN_lbzux, -- 160
+        INSN_lbzcix, -- 160
+        INSN_lbzux,
         INSN_lbzx,
         INSN_ldarx,
         INSN_ldbrx,
@@ -212,8 +214,8 @@ package decode_types is
         INSN_ldux,
         INSN_lharx,
         INSN_lhax,
-        INSN_lhaux,
-        INSN_lhbrx, -- 170
+        INSN_lhaux, -- 170
+        INSN_lhbrx,
         INSN_lhzcix,
         INSN_lhzx,
         INSN_lhzux,
@@ -222,8 +224,8 @@ package decode_types is
         INSN_lwaux,
         INSN_lwbrx,
         INSN_lwzcix,
-        INSN_lwzx,
-        INSN_lwzux, -- 180
+        INSN_lwzx, -- 180
+        INSN_lwzux,
         INSN_modsd,
         INSN_modsw,
         INSN_moduw,
@@ -232,51 +234,54 @@ package decode_types is
         INSN_mulhwu,
         INSN_mulhd,
         INSN_mulhdu,
-        INSN_mullw,
-        INSN_mulld, -- 190
+        INSN_mullw, -- 190
+        INSN_mulld,
         INSN_nand,
         INSN_nor,
         INSN_or,
         INSN_orc,
+        INSN_pdepd,
+        INSN_pextd,
         INSN_rldcl,
         INSN_rldcr,
-        INSN_rlwnm,
+        INSN_rlwnm, -- 200
         INSN_slw,
         INSN_sld,
-        INSN_sraw, -- 200
+        INSN_sraw,
         INSN_srad,
         INSN_srw,
         INSN_srd,
         INSN_stbcix,
         INSN_stbcx,
         INSN_stbx,
-        INSN_stbux,
+        INSN_stbux, -- 210
         INSN_stdbrx,
         INSN_stdcix,
-        INSN_stdcx, -- 210
+        INSN_stdcx,
         INSN_stdx,
         INSN_stdux,
         INSN_sthbrx,
         INSN_sthcix,
         INSN_sthcx,
         INSN_sthx,
-        INSN_sthux,
+        INSN_sthux, -- 220
         INSN_stwbrx,
         INSN_stwcix,
-        INSN_stwcx, -- 220
+        INSN_stwcx,
         INSN_stwx,
         INSN_stwux,
         INSN_subf,
         INSN_subfc,
         INSN_subfe,
         INSN_td,
-        INSN_tlbie,
+        INSN_tlbie, -- 230
         INSN_tlbiel,
         INSN_tw,
-        INSN_xor, -- 230
+        INSN_xor,
 
-        -- pad to 232 to simplify comparison logic
-        INSN_231,
+        -- pad to 240 to simplify comparison logic
+        INSN_234, INSN_235,
+        INSN_236, INSN_237, INSN_238, INSN_239,
 
         -- The following instructions have a third input addressed by RC
         INSN_maddld,
@@ -284,9 +289,7 @@ package decode_types is
         INSN_maddhdu,
 
         -- pad to 256 to simplify comparison logic
-        INSN_235,
-        INSN_236, INSN_237, INSN_238, INSN_239,
-        INSN_240, INSN_241, INSN_242, INSN_243,
+        INSN_243,
         INSN_244, INSN_245, INSN_246, INSN_247,
         INSN_248, INSN_249, INSN_250, INSN_251,
         INSN_252, INSN_253, INSN_254, INSN_255,
diff --git a/execute1.vhdl b/execute1.vhdl
index 9b55195..2cc9c35 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -113,6 +113,7 @@ architecture behaviour of execute1 is
         direct_branch : std_ulogic;
         start_mul : std_ulogic;
         start_div : std_ulogic;
+        start_bsort : std_ulogic;
         do_trace : std_ulogic;
         fp_intr : std_ulogic;
         res2_sel : std_ulogic_vector(1 downto 0);
@@ -134,7 +135,7 @@ architecture behaviour of execute1 is
         prev_op : insn_type_t;
         prev_prefixed : std_ulogic;
         oe : std_ulogic;
-        mul_select : std_ulogic_vector(1 downto 0);
+        mul_select : std_ulogic_vector(2 downto 0);
         res2_sel : std_ulogic_vector(1 downto 0);
         spr_select : spr_id;
         pmu_spr_num : std_ulogic_vector(4 downto 0);
@@ -144,6 +145,7 @@ architecture behaviour of execute1 is
 	mul_in_progress : std_ulogic;
         mul_finish : std_ulogic;
         div_in_progress : std_ulogic;
+        bsort_in_progress : std_ulogic;
         no_instr_avail : std_ulogic;
         instr_dispatch : std_ulogic;
         ext_interrupt : std_ulogic;
@@ -164,10 +166,11 @@ architecture behaviour of execute1 is
          busy => '0',
          fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL,
          prev_prefixed => '0',
-         oe => '0', mul_select => "00", res2_sel => "00",
+         oe => '0', mul_select => "000", res2_sel => "00",
          spr_select => spr_id_init, pmu_spr_num => 5x"0",
          redir_to_next => '0', advance_nia => '0', lr_from_next => '0',
          mul_in_progress => '0', mul_finish => '0', div_in_progress => '0',
+         bsort_in_progress => '0',
          no_instr_avail => '0', instr_dispatch => '0', ext_interrupt => '0',
          taken_branch_event => '0', br_mispredict => '0',
          msr => 64x"0",
@@ -209,7 +212,8 @@ architecture behaviour of execute1 is
     signal alu_result: std_ulogic_vector(63 downto 0);
     signal adder_result: std_ulogic_vector(63 downto 0);
     signal misc_result: std_ulogic_vector(63 downto 0);
-    signal muldiv_result: std_ulogic_vector(63 downto 0);
+    signal multicyc_result: std_ulogic_vector(63 downto 0);
+    signal bsort_result: std_ulogic_vector(63 downto 0);
     signal spr_result: std_ulogic_vector(63 downto 0);
     signal next_nia : std_ulogic_vector(63 downto 0);
     signal s1_sel : std_ulogic_vector(2 downto 0);
@@ -234,6 +238,10 @@ architecture behaviour of execute1 is
     signal x_to_divider: Execute1ToDividerType;
     signal divider_to_x: DividerToExecute1Type := DividerToExecute1Init;
 
+    -- bit-sort unit signals
+    signal bsort_start : std_ulogic;
+    signal bsort_done  : std_ulogic;
+
     -- random number generator signals
     signal random_raw  : std_ulogic_vector(63 downto 0);
     signal random_cond : std_ulogic_vector(63 downto 0);
@@ -493,6 +501,18 @@ begin
                 );
     end generate;
 
+    bsort_0: entity work.bit_sorter
+        port map (
+            clk => clk,
+            rst => rst,
+            rs => c_in,
+            rb => b_in,
+            go => bsort_start,
+            opc => e_in.insn(7 downto 6),
+            done => bsort_done,
+            result => bsort_result
+            );
+
     random_0: entity work.random
         port map (
             clk => clk,
@@ -664,7 +684,7 @@ begin
         adder_result       when "000",
         logical_result     when "001",
         rotator_result     when "010",
-        muldiv_result      when "100",
+        multicyc_result    when "100",
         ramspr_result      when "101",
         misc_result        when others;
 
@@ -845,17 +865,21 @@ begin
         x_to_mult_32s.subtract <= '0';
         x_to_mult_32s.addend <= (others => '0');
 
-        case ex1.mul_select is
-            when "00" =>
-                muldiv_result <= multiply_to_x.result(63 downto 0);
-            when "01" =>
-                muldiv_result <= multiply_to_x.result(127 downto 64);
-            when "10" =>
-                muldiv_result <= multiply_to_x.result(63 downto 32) &
-                                 multiply_to_x.result(63 downto 32);
-            when others =>
-                muldiv_result <= divider_to_x.write_reg_data;
-        end case;
+        if ex1.mul_select(2) = '0' then
+            case ex1.mul_select(1 downto 0) is
+                when "00" =>
+                    multicyc_result <= multiply_to_x.result(63 downto 0);
+                when "01" =>
+                    multicyc_result <= multiply_to_x.result(63 downto 32) &
+                                       multiply_to_x.result(63 downto 32);
+                when others =>
+                    multicyc_result <= multiply_to_x.result(127 downto 64);
+            end case;
+        elsif ex1.mul_select(0) = '1' and not HAS_FPU then
+            multicyc_result <= divider_to_x.write_reg_data;
+        else
+            multicyc_result <= bsort_result;
+        end if;
 
         -- Compute misc_result
         case e_in.sub_select is
@@ -1266,7 +1290,7 @@ begin
                 end if;
                 v.do_trace := '0';
 
-            when OP_CNTZ | OP_POPCNT =>
+            when OP_COUNTB =>
                 v.res2_sel := "01";
                 slow_op := '1';
 	    when OP_ISEL =>
@@ -1388,6 +1412,11 @@ begin
 	    when OP_ICBI =>
 		v.se.icache_inval := '1';
 
+            when OP_BSORT =>
+                v.start_bsort := '1';
+                slow_op := '1';
+                owait := '1';
+
 	    when OP_MUL_L64 =>
                 if e_in.is_32bit = '1' then
                     v.se.mult_32s := '1';
@@ -1565,7 +1594,7 @@ begin
             v.oe := e_in.oe;
             v.spr_select := e_in.spr_select;
             v.pmu_spr_num := e_in.insn(20 downto 16);
-            v.mul_select := e_in.sub_select(1 downto 0);
+            v.mul_select := e_in.sub_select;
             v.se := side_effect_init;
             v.ramspr_wraddr := e_in.ramspr_wraddr;
             v.lr_from_next := e_in.lr;
@@ -1596,7 +1625,7 @@ begin
 	rot_clear_right <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCR else '0';
         rot_sign_ext <= '1' when e_in.insn_type = OP_EXTSWSLI else '0';
 
-        do_popcnt <= '1' when e_in.insn_type = OP_POPCNT else '0';
+        do_popcnt <= '1' when e_in.insn_type = OP_COUNTB and e_in.insn(7 downto 6) = "11" else '0';
 
         if valid_in = '1' then
             v.prev_op := e_in.insn_type;
@@ -1671,6 +1700,7 @@ begin
             v.mul_in_progress := actions.start_mul;
             x_to_divider.valid <= actions.start_div;
             v.div_in_progress := actions.start_div;
+            v.bsort_in_progress := actions.start_bsort;
             v.br_mispredict := v.e.redirect and actions.direct_branch;
             v.advance_nia := actions.advance_nia;
             v.redir_to_next := actions.redir_to_next;
@@ -1681,7 +1711,7 @@ begin
             -- multiply is happening in order to stop following
             -- instructions from using the wrong XER value
             -- (and for simplicity in the OE=0 case).
-            v.busy := actions.start_div or actions.start_mul;
+            v.busy := actions.start_div or actions.start_mul or actions.start_bsort;
 
             -- instruction for other units, i.e. LDST
             if e_in.unit = LDST then
@@ -1692,6 +1722,7 @@ begin
             end if;
         end if;
         is_scv := go and actions.se.scv_trap;
+        bsort_start <= go and actions.start_bsort;
 
         if not HAS_FPU and ex1.div_in_progress = '1' then
             v.div_in_progress := not divider_to_x.valid;
@@ -1724,6 +1755,13 @@ begin
             end if;
             v.e.valid := '1';
         end if;
+        if ex1.bsort_in_progress = '1' then
+            v.bsort_in_progress := not bsort_done;
+            v.e.valid := bsort_done;
+            v.busy := not bsort_done;
+            v.e.write_data := alu_result;
+            bypass_valid := bsort_done;
+        end if;
 
         if v.e.write_xerc_enable = '1' and v.e.valid = '1' then
             v.xerc := v.e.xerc;
diff --git a/microwatt.core b/microwatt.core
index dad180f..f56bee0 100644
--- a/microwatt.core
+++ b/microwatt.core
@@ -20,6 +20,7 @@ filesets:
       - sim_console.vhdl
       - logical.vhdl
       - countbits.vhdl
+      - bitsort.vhdl
       - control.vhdl
       - execute1.vhdl
       - fpu.vhdl
diff --git a/predecode.vhdl b/predecode.vhdl
index 1846e3c..65cb751 100644
--- a/predecode.vhdl
+++ b/predecode.vhdl
@@ -219,6 +219,7 @@ architecture behaviour of predecoder is
         2#0_00101_11011#  =>  INSN_brd,
         2#0_01001_11010#  =>  INSN_cbcdtd,
         2#0_01000_11010#  =>  INSN_cdtbcd,
+        2#0_00110_11100#  =>  INSN_cfuged,
         2#0_00000_00000#  =>  INSN_cmp,
         2#0_01111_11100#  =>  INSN_cmpb,
         2#0_00111_00000#  =>  INSN_cmpeqb,
@@ -363,6 +364,8 @@ architecture behaviour of predecoder is
         2#0_00011_11100#  =>  INSN_nor,
         2#0_01101_11100#  =>  INSN_or,
         2#0_01100_11100#  =>  INSN_orc,
+        2#0_00100_11100#  =>  INSN_pdepd,
+        2#0_00101_11100#  =>  INSN_pextd,
         2#0_00011_11010#  =>  INSN_popcntb,
         2#0_01111_11010#  =>  INSN_popcntd,
         2#0_01011_11010#  =>  INSN_popcntw,
diff --git a/scripts/fmt_log/fmt_log.c b/scripts/fmt_log/fmt_log.c
index 226cfbe..aa0573a 100644
--- a/scripts/fmt_log/fmt_log.c
+++ b/scripts/fmt_log/fmt_log.c
@@ -87,11 +87,11 @@ const char *units[4] = { "al", "ls", "fp", "3?" };
 const char *ops[64] =
 {
 	"illegal", "nop    ", "add    ", "attn   ", "b      ", "bc     ", "bcreg  ", "bcd    ",
-	"bperm  ", "brev   ", "cmp    ", "cmpb   ", "cmpeqb ", "cmprb  ", "cntz   ", "crop   ",
+	"bperm  ", "brev   ", "cmp    ", "cmpb   ", "cmpeqb ", "cmprb  ", "countb ", "crop   ",
 	"darn   ", "dcbf   ", "dcbst  ", "xcbt   ", "dcbtst ", "dcbz   ", "icbi   ", "fpcmp  ",
 	"fparith", "fpmove ", "fpmisc ", "div    ", "dive   ", "mod    ", "exts   ", "extswsl",
 	"isel   ", "isync  ", "logic  ", "ld     ", "st     ", "mcrxrx ", "mfcr   ", "mfmsr  ",
-	"mfspr  ", "mtcrf  ", "mtmsr  ", "mtspr  ", "mull64 ", "mulh64 ", "mulh32 ", "popcnt ",
+	"mfspr  ", "mtcrf  ", "mtmsr  ", "mtspr  ", "mull64 ", "mulh64 ", "mulh32 ", "bsort  ",
 	"prty   ", "rfid   ", "rlc    ", "rlcl   ", "rlcr   ", "sc     ", "setb   ", "shl    ",
 	"shr    ", "sync   ", "tlbie  ", "trap   ", "xor    ", "addg6s ", "wait   ", "ffail  ",
 };