diff --git a/gogf127/gf127_test.go b/gogf127/gf127_test.go
index d098925..a6addb6 100644
--- a/gogf127/gf127_test.go
+++ b/gogf127/gf127_test.go
@@ -38,6 +38,21 @@ func TestMul(t *testing.T) {
 	}
 }
 
+// TestMulInPlace checks that Mul gives the right product when c aliases an operand.
+func TestMulInPlace(t *testing.T) {
+	for _, tc := range testCasesMul {
+		a, b := *tc[0], *tc[1]
+		Mul(&a, &b, &b) // result overwrites the second operand
+		require.Equal(t, *tc[0], a)
+		require.Equal(t, *tc[2], b)
+
+		b = *tc[1]
+		Mul(&a, &b, &a) // result overwrites the first operand
+		require.Equal(t, *tc[1], b)
+		require.Equal(t, *tc[2], a)
+	}
+}
+
 var testCasesMul10 = [][2]*GF127{
 	{&GF127{123, 0}, &GF127{246, 0}},
 	{&GF127{maxUint64, 2}, &GF127{maxUint64 - 1, 5}},
diff --git a/gogf127/gogf127.go b/gogf127/gogf127.go
index 8ba33e9..10af5c4 100644
--- a/gogf127/gogf127.go
+++ b/gogf127/gogf127.go
@@ -160,35 +160,24 @@ func Add(a, b, c *GF127) {
 }
 
 // Mul sets c to a*b.
-// TODO make it work in-place without allocations
-// TODO optimization: no need to perform shift by i every time, cache results
 func Mul(a, b, c *GF127) {
 	r := new(GF127)
-	d := new(GF127)
+	d := *a // private copy of a, so c may alias a; d holds a*x^i at bit i
 	for i := uint(0); i < 64; i++ {
 		if b[0]&(1<<i) != 0 {
-			shl(i, a, d)
-			Add(r, d, r)
+			Add(r, &d, r) // bit i of b[0] is set: fold the current d into r
 		}
+		Mul10(&d, &d) // advance d by one more factor of x for the next bit
 	}
 	for i := uint(0); i < 63; i++ {
 		if b[1]&(1<<i) != 0 {
-			shl(i+64, a, d)
-			Add(r, d, r)
+			Add(r, &d, r) // bit i of b[1] corresponds to a shift by i+64
 		}
+		Mul10(&d, &d) // keep stepping d; it carries over from the first loop
 	}
 	*c = *r
 }
 
-// shl performs left shift by consecutive multiplications by 2.
-func shl(count uint, a, b *GF127) {
-	b[0] = a[0]
-	b[1] = a[1]
-	for i := uint(0); i < count; i++ {
-		Mul10(b, b)
-	}
-}
-
 // Mul10 sets b to a*x.
 func Mul10(a, b *GF127) {
 	c := (a[0] & msb64) >> 63