diff --git a/decode1.vhdl b/decode1.vhdl
index 7163ff9..e821469 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -419,6 +419,7 @@ architecture behaviour of decode1 is
         2#10010#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fdivs
         2#10100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fsubs
         2#10101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fadds
+        2#10110#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fsqrts
         2#11000#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fres
         2#11001#  =>  (FPU,   OP_FPOP,       FRA,  NONE, FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fmuls
         2#11010#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- frsqrtes
@@ -477,6 +478,7 @@ architecture behaviour of decode1 is
         2#0010#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fdiv
         2#0100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fsub
         2#0101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fadd
+        2#0110#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fsqrt
         2#0111#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fsel
         2#1000#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fre
         2#1001#  =>  (FPU,   OP_FPOP,       FRA,  NONE, FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fmul
diff --git a/fpu.vhdl b/fpu.vhdl
index 0cbd43f..244454e 100644
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -40,7 +40,7 @@ architecture behaviour of fpu is
                      DO_FMR, DO_FMRG, DO_FCMP,
                      DO_FCFID, DO_FCTI,
                      DO_FRSP, DO_FRI,
-                     DO_FADD, DO_FMUL, DO_FDIV,
+                     DO_FADD, DO_FMUL, DO_FDIV, DO_FSQRT,
                      DO_FRE, DO_FRSQRTE,
                      DO_FSEL,
                      FRI_1,
@@ -51,6 +51,9 @@ architecture behaviour of fpu is
                      DIV_2, DIV_3, DIV_4, DIV_5, DIV_6,
                      FRE_1,
                      RSQRT_1,
+                     SQRT_1, SQRT_2, SQRT_3, SQRT_4,
+                     SQRT_5, SQRT_6, SQRT_7, SQRT_8,
+                     SQRT_9, SQRT_10, SQRT_11, SQRT_12,
                      INT_SHIFT, INT_ROUND, INT_ISHIFT,
                      INT_FINAL, INT_CHECK, INT_OFLOW,
                      FINISH, NORMALIZE,
@@ -140,6 +143,7 @@ architecture behaviour of fpu is
     constant BIN_ZERO : std_ulogic_vector(1 downto 0) := "00";
     constant BIN_R    : std_ulogic_vector(1 downto 0) := "01";
     constant BIN_MASK : std_ulogic_vector(1 downto 0) := "10";
+    constant BIN_PS6  : std_ulogic_vector(1 downto 0) := "11";
 
     constant RES_SUM   : std_ulogic_vector(1 downto 0) := "00";
     constant RES_SHIFT : std_ulogic_vector(1 downto 0) := "01";
@@ -604,6 +608,7 @@ begin
         variable pshift      : std_ulogic;
         variable renorm_sqrt : std_ulogic;
         variable sqrt_exp    : signed(EXP_BITS-1 downto 0);
+        variable shiftin     : std_ulogic;
     begin
         v := r;
         illegal := '0';
@@ -717,6 +722,7 @@ begin
         set_y := '0';
         pshift := '0';
         renorm_sqrt := '0';
+        shiftin := '0';
         case r.state is
             when IDLE =>
                 if e_in.valid = '1' then
@@ -765,6 +771,9 @@ begin
                             v.state := DO_FDIV;
                         when "10100" | "10101" =>
                             v.state := DO_FADD;
+                        when "10110" =>
+                            v.is_sqrt := '1';
+                            v.state := DO_FSQRT;
                         when "10111" =>
                             v.state := DO_FSEL;
                         when "11000" =>
@@ -1248,6 +1257,43 @@ begin
                 v.quieten_nan := '0';
                 arith_done := '1';
 
+            when DO_FSQRT =>
+                opsel_a <= AIN_B;
+                v.result_class := r.b.class;
+                v.result_sign := r.b.negative;
+                v.fpscr(FPSCR_FR) := '0';
+                v.fpscr(FPSCR_FI) := '0';
+                if r.b.class = NAN and r.b.mantissa(53) = '0' then
+                    v.fpscr(FPSCR_VXSNAN) := '1';
+                    invalid := '1';
+                end if;
+                case r.b.class is
+                    when FINITE =>
+                        v.result_exp := r.b.exponent;
+                        if r.b.negative = '1' then
+                            v.fpscr(FPSCR_VXSQRT) := '1';
+                            qnan_result := '1';
+                            arith_done := '1';
+                        elsif r.b.mantissa(54) = '0' then
+                            v.state := RENORM_B;
+                        elsif r.b.exponent(0) = '0' then
+                            v.state := SQRT_1;
+                        else
+                            v.shift := to_signed(1, EXP_BITS);
+                            v.state := RENORM_B2;
+                        end if;
+                    when NAN | ZERO =>
+                        -- result is B
+                        arith_done := '1';
+                    when INFINITY =>
+                        if r.b.negative = '1' then
+                            v.fpscr(FPSCR_VXSQRT) := '1';
+                            qnan_result := '1';
+                        -- else result is B
+                        end if;
+                        arith_done := '1';
+                end case;
+
             when DO_FRE =>
                 opsel_a <= AIN_B;
                 v.result_class := r.b.class;
@@ -1454,7 +1500,11 @@ begin
                 -- wait one cycle for inverse_table[B] lookup
                 v.first := '1';
                 if r.insn(4) = '0' then
-                    v.state := DIV_2;
+                    if r.insn(3) = '0' then
+                        v.state := DIV_2;
+                    else
+                        v.state := SQRT_1;
+                    end if;
                 elsif r.insn(2) = '0' then
                     v.state := FRE_1;
                 else
@@ -1545,6 +1595,156 @@ begin
                 v.shift := to_signed(1, EXP_BITS);
                 v.state := NORMALIZE;
 
+            when SQRT_1 =>
+                -- put invsqr[B] in R and compute P = invsqr[B] * B
+                -- also transfer B (in R) to A
+                set_a := '1';
+                opsel_r <= RES_MISC;
+                misc_sel <= "0111";
+                msel_1 <= MUL1_B;
+                msel_2 <= MUL2_LUT;
+                f_to_multiply.valid <= '1';
+                v.shift := to_signed(-1, EXP_BITS);
+                v.count := "00";
+                v.state := SQRT_2;
+
+            when SQRT_2 =>
+                -- shift R right one place
+                -- not expecting multiplier result yet
+                opsel_r <= RES_SHIFT;
+                v.first := '1';
+                v.state := SQRT_3;
+
+            when SQRT_3 =>
+                -- put R into Y, wait for product from multiplier
+                msel_2 <= MUL2_R;
+                set_y := r.first;
+                pshift := '1';
+                if multiply_to_f.valid = '1' then
+                    -- put result into R
+                    opsel_r <= RES_MULT;
+                    v.first := '1';
+                    v.state := SQRT_4;
+                end if;
+
+            when SQRT_4 =>
+                -- compute 1.5 - Y * P
+                msel_1 <= MUL1_Y;
+                msel_2 <= MUL2_P;
+                msel_add <= MULADD_CONST;
+                msel_inv <= '1';
+                f_to_multiply.valid <= r.first;
+                pshift := '1';
+                if multiply_to_f.valid = '1' then
+                    v.state := SQRT_5;
+                end if;
+
+            when SQRT_5 =>
+                -- compute Y = Y * P
+                msel_1 <= MUL1_Y;
+                msel_2 <= MUL2_P;
+                f_to_multiply.valid <= '1';
+                v.first := '1';
+                v.state := SQRT_6;
+
+            when SQRT_6 =>
+                -- pipeline in R = R * P
+                msel_1 <= MUL1_R;
+                msel_2 <= MUL2_P;
+                f_to_multiply.valid <= r.first;
+                pshift := '1';
+                if multiply_to_f.valid = '1' then
+                    v.first := '1';
+                    v.state := SQRT_7;
+                end if;
+
+            when SQRT_7 =>
+                -- first multiply is done, put result in Y
+                msel_2 <= MUL2_P;
+                set_y := r.first;
+                -- wait for second multiply (should be here already)
+                pshift := '1';
+                if multiply_to_f.valid = '1' then
+                    -- put result into R
+                    opsel_r <= RES_MULT;
+                    v.first := '1';
+                    v.count := r.count + 1;
+                    if r.count < 2 then
+                        v.state := SQRT_4;
+                    else
+                        v.first := '1';
+                        v.state := SQRT_8;
+                    end if;
+                end if;
+
+            when SQRT_8 =>
+                -- compute P = A - R * R, which can be +ve or -ve
+                -- we arranged for B to be put into A earlier
+                msel_1 <= MUL1_R;
+                msel_2 <= MUL2_R;
+                msel_add <= MULADD_A;
+                msel_inv <= '1';
+                pshift := '1';
+                f_to_multiply.valid <= r.first;
+                if multiply_to_f.valid = '1' then
+                    v.first := '1';
+                    v.state := SQRT_9;
+                end if;
+
+            when SQRT_9 =>
+                -- compute P = P * Y
+                -- since Y is an estimate of 1/sqrt(B), this makes P an
+                -- estimate of the adjustment needed to R.  Since the error
+                -- could be negative and we have an unsigned multiplier, the
+                -- upper bits can be wrong, but it turns out the lowest 8 bits
+                -- are correct and are all we need (given 3 iterations through
+                -- SQRT_4 to SQRT_7).
+                msel_1 <= MUL1_Y;
+                msel_2 <= MUL2_P;
+                pshift := '1';
+                f_to_multiply.valid <= r.first;
+                if multiply_to_f.valid = '1' then
+                    v.state := SQRT_10;
+                end if;
+
+            when SQRT_10 =>
+                -- Add the bottom 8 bits of P, sign-extended,
+                -- divided by 4, onto R.
+                -- The division by 4 is because R is 10.54 format
+                -- whereas P is 8.56 format.
+                opsel_b <= BIN_PS6;
+                sqrt_exp := r.b.exponent(EXP_BITS-1) & r.b.exponent(EXP_BITS-1 downto 1);
+                v.result_exp := sqrt_exp;
+                v.shift := to_signed(1, EXP_BITS);
+                v.first := '1';
+                v.state := SQRT_11;
+
+            when SQRT_11 =>
+                -- compute P = A - R * R (remainder)
+                -- also put 2 * R + 1 into B for comparison with P
+                msel_1 <= MUL1_R;
+                msel_2 <= MUL2_R;
+                msel_add <= MULADD_A;
+                msel_inv <= '1';
+                f_to_multiply.valid <= r.first;
+                shiftin := '1';
+                set_b := r.first;
+                if multiply_to_f.valid = '1' then
+                    v.state := SQRT_12;
+                end if;
+
+            when SQRT_12 =>
+                -- test if remainder is 0 or >= B = 2*R + 1
+                if pcmpb_lt = '1' then
+                    -- square root is correct, set X if remainder non-zero
+                    v.x := r.p(58) or px_nz;
+                else
+                    -- square root needs to be incremented by 1
+                    carry_in <= '1';
+                    v.x := not pcmpb_eq;
+                end if;
+                v.state := FINISH;
+
             when INT_SHIFT =>
                 opsel_r <= RES_SHIFT;
                 set_x := '1';
@@ -1828,8 +2028,12 @@ begin
         maddend := (others => '0');
         case msel_add is
             when MULADD_CONST =>
-                -- addend is 2.0 in 16.112 format
-                maddend(113) := '1';                -- 2.0
+                -- addend is 2.0 or 1.5 in 16.112 format
+                if r.is_sqrt = '0' then
+                    maddend(113) := '1';                -- 2.0
+                else
+                    maddend(112 downto 111) := "11";    -- 1.5
+                end if;
             when MULADD_A =>
                 -- addend is A in 16.112 format
                 maddend(121 downto 58) := r.a.mantissa;
@@ -1895,14 +2099,15 @@ begin
             when BIN_MASK =>
                 in_b0 := mask;
             when others =>
-                in_b0 := (others => '0');
+                -- BIN_PS6, 6 LSBs of P/4 sign-extended to 64
+                in_b0 := std_ulogic_vector(resize(signed(r.p(7 downto 2)), 64));
         end case;
         if opsel_binv = '1' then
             in_b0 := not in_b0;
         end if;
         in_b <= in_b0;
         if r.shift >= to_signed(-64, EXP_BITS) and r.shift <= to_signed(63, EXP_BITS) then
-            shift_res := shifter_64(r.r & x"00000000000000",
+            shift_res := shifter_64(r.r & shiftin & 55x"00000000000000",
                                     std_ulogic_vector(r.shift(6 downto 0)));
         else
             shift_res := (others => '0');
diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c
index d9c5c06..b72b01e 100644
--- a/tests/fpu/fpu.c
+++ b/tests/fpu/fpu.c
@@ -1291,6 +1291,53 @@ int fpu_test_21(void)
 	return trapit(0, test21);
 }
 
+struct sqrtvals {
+	unsigned long val;
+	unsigned long inv;
+} sqrtvals[] = {
+	{ 0x0000000000000000, 0x0000000000000000 },
+	{ 0x8000000000000000, 0x8000000000000000 },
+	{ 0xfff0000000000000, 0x7ff8000000000000 },
+	{ 0x7ff0000000000000, 0x7ff0000000000000 },
+	{ 0xfff123456789abcd, 0xfff923456789abcd },
+	{ 0x3ff0000000000000, 0x3ff0000000000000 },
+	{ 0x4000000000000000, 0x3ff6a09e667f3bcd },
+	{ 0x4010000000000000, 0x4000000000000000 },
+	{ 0xbff0000000000000, 0x7ff8000000000000 },
+	{ 0x4008000000000000, 0x3ffbb67ae8584caa },
+	{ 0x7fd0000000000000, 0x5fe0000000000000 },
+	{ 0x0008000000000000, 0x1ff6a09e667f3bcd },
+	{ 0x0004000000000000, 0x1ff0000000000000 },
+	{ 0x0002000000000000, 0x1fe6a09e667f3bcd },
+	{ 0x0000000000000002, 0x1e66a09e667f3bcd },
+	{ 0x0000000000000001, 0x1e60000000000000 },
+};
+
+int test22(long arg)
+{
+	long i;
+	unsigned long result;
+	struct sqrtvals *vp = sqrtvals;
+
+	set_fpscr(FPS_RN_NEAR);
+	for (i = 0; i < sizeof(sqrtvals) / sizeof(sqrtvals[0]); ++i, ++vp) {
+		asm("lfd 6,0(%0); fsqrt 7,6; stfd 7,0(%1)"
+		    : : "b" (&vp->val), "b" (&result) : "memory");
+		if (result != vp->inv) {
+			print_hex(i, 2, " ");
+			print_hex(result, 16, " ");
+			return i + 1;
+		}
+	}
+	return 0;
+}
+
+int fpu_test_22(void)
+{
+	enable_fp();
+	return trapit(0, test22);
+}
+
 int fail = 0;
 
 void do_test(int num, int (*test)(void))
@@ -1337,6 +1384,7 @@ int main(void)
 	do_test(19, fpu_test_19);
 	do_test(20, fpu_test_20);
 	do_test(21, fpu_test_21);
+	do_test(22, fpu_test_22);
 
 	return fail;
 }
diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin
index 0253720..e378341 100755
Binary files a/tests/test_fpu.bin and b/tests/test_fpu.bin differ
diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out
index b6bc733..9b97cb5 100644
--- a/tests/test_fpu.console_out
+++ b/tests/test_fpu.console_out
@@ -19,3 +19,4 @@ test 18:PASS
 test 19:PASS
 test 20:PASS
 test 21:PASS
+test 22:PASS