afe5a2a9b7
Co-authored-by: Nicolas De Loof <nicolas.deloof@gmail.com> Co-authored-by: Sebastiaan van Stijn <github@gone.nl> Signed-off-by: Laura Brehm <laurabrehm@hey.com>
85 lines
1.8 KiB
Go
85 lines
1.8 KiB
Go
package compress
|
|
|
|
import "math"
|
|
|
|
// Estimate returns a normalized compressibility estimate of block b.
|
|
// Values close to zero are likely uncompressible.
|
|
// Values above 0.1 are likely to be compressible.
|
|
// Values above 0.5 are very compressible.
|
|
// Very small lengths will return 0.
|
|
func Estimate(b []byte) float64 {
|
|
if len(b) < 16 {
|
|
return 0
|
|
}
|
|
|
|
// Correctly predicted order 1
|
|
hits := 0
|
|
lastMatch := false
|
|
var o1 [256]byte
|
|
var hist [256]int
|
|
c1 := byte(0)
|
|
for _, c := range b {
|
|
if c == o1[c1] {
|
|
// We only count a hit if there was two correct predictions in a row.
|
|
if lastMatch {
|
|
hits++
|
|
}
|
|
lastMatch = true
|
|
} else {
|
|
lastMatch = false
|
|
}
|
|
o1[c1] = c
|
|
c1 = c
|
|
hist[c]++
|
|
}
|
|
|
|
// Use x^0.6 to give better spread
|
|
prediction := math.Pow(float64(hits)/float64(len(b)), 0.6)
|
|
|
|
// Calculate histogram distribution
|
|
variance := float64(0)
|
|
avg := float64(len(b)) / 256
|
|
|
|
for _, v := range hist {
|
|
Δ := float64(v) - avg
|
|
variance += Δ * Δ
|
|
}
|
|
|
|
stddev := math.Sqrt(float64(variance)) / float64(len(b))
|
|
exp := math.Sqrt(1 / float64(len(b)))
|
|
|
|
// Subtract expected stddev
|
|
stddev -= exp
|
|
if stddev < 0 {
|
|
stddev = 0
|
|
}
|
|
stddev *= 1 + exp
|
|
|
|
// Use x^0.4 to give better spread
|
|
entropy := math.Pow(stddev, 0.4)
|
|
|
|
// 50/50 weight between prediction and histogram distribution
|
|
return math.Pow((prediction+entropy)/2, 0.9)
|
|
}
|
|
|
|
// ShannonEntropyBits returns the number of bits minimum required to represent
|
|
// an entropy encoding of the input bytes.
|
|
// https://en.wiktionary.org/wiki/Shannon_entropy
|
|
func ShannonEntropyBits(b []byte) int {
|
|
if len(b) == 0 {
|
|
return 0
|
|
}
|
|
var hist [256]int
|
|
for _, c := range b {
|
|
hist[c]++
|
|
}
|
|
shannon := float64(0)
|
|
invTotal := 1.0 / float64(len(b))
|
|
for _, v := range hist[:] {
|
|
if v > 0 {
|
|
n := float64(v)
|
|
shannon += math.Ceil(-math.Log2(n*invTotal) * n)
|
|
}
|
|
}
|
|
return int(math.Ceil(shannon))
|
|
}
|