hrw/hrw.go
Evgenii Stratonikov 16a7740ccd [#8] hrw: Introduce StringHash() for hashing strings directly
```
goos: linux
goarch: amd64
pkg: git.frostfs.info/TrueCloudLab/hrw
cpu: 11th Gen Intel(R) Core(TM) i5-1135G7 @ 2.40GHz
                                         │      6      │                  7                   │
                                         │   sec/op    │    sec/op     vs base                │
SortHashersByValue_Typed_fnv_10-8          248.8n ± 1%   166.9n ±  9%  -32.93% (p=0.000 n=10)
SortHashersByValue_Typed_fnv_100-8         2.195µ ± 6%   1.297µ ±  6%  -40.93% (p=0.000 n=10)
SortHashersByValue_Typed_fnv_1000-8        22.47µ ± 4%   12.42µ ± 10%  -44.73% (p=0.000 n=10)
SortHashersByWeightValueTyped_fnv_10-8     301.7n ± 6%   180.8n ±  4%  -40.09% (p=0.000 n=10)
SortHashersByWeightValueTyped_fnv_100-8    2.526µ ± 1%   1.378µ ±  5%  -45.47% (p=0.000 n=10)
SortHashersByWeightValueTyped_fnv_1000-8   24.37µ ± 2%   12.80µ ±  4%  -47.48% (p=0.000 n=10)
geomean                                    2.472µ        1.430µ        -42.13%

                                         │      6       │                   7                   │
                                         │     B/op     │     B/op      vs base                 │
SortHashersByValue_Typed_fnv_10-8            144.0 ± 0%     144.0 ± 0%       ~ (p=1.000 n=10) ¹
SortHashersByValue_Typed_fnv_100-8           960.0 ± 0%     960.0 ± 0%       ~ (p=1.000 n=10) ¹
SortHashersByValue_Typed_fnv_1000-8        8.062Ki ± 0%   8.062Ki ± 0%       ~ (p=1.000 n=10) ¹
SortHashersByWeightValueTyped_fnv_10-8       144.0 ± 0%     144.0 ± 0%       ~ (p=1.000 n=10) ¹
SortHashersByWeightValueTyped_fnv_100-8      960.0 ± 0%     960.0 ± 0%       ~ (p=1.000 n=10) ¹
SortHashersByWeightValueTyped_fnv_1000-8   8.062Ki ± 0%   8.062Ki ± 0%       ~ (p=1.000 n=10) ¹
geomean                                    1.021Ki        1.021Ki       +0.00%
¹ all samples are equal

                                         │     6      │                  7                  │
                                         │ allocs/op  │ allocs/op   vs base                 │
SortHashersByValue_Typed_fnv_10-8          2.000 ± 0%   2.000 ± 0%       ~ (p=1.000 n=10) ¹
SortHashersByValue_Typed_fnv_100-8         2.000 ± 0%   2.000 ± 0%       ~ (p=1.000 n=10) ¹
SortHashersByValue_Typed_fnv_1000-8        2.000 ± 0%   2.000 ± 0%       ~ (p=1.000 n=10) ¹
SortHashersByWeightValueTyped_fnv_10-8     2.000 ± 0%   2.000 ± 0%       ~ (p=1.000 n=10) ¹
SortHashersByWeightValueTyped_fnv_100-8    2.000 ± 0%   2.000 ± 0%       ~ (p=1.000 n=10) ¹
SortHashersByWeightValueTyped_fnv_1000-8   2.000 ± 0%   2.000 ± 0%       ~ (p=1.000 n=10) ¹
geomean                                    2.000        2.000       +0.00%
¹ all samples are equal
```

Signed-off-by: Evgenii Stratonikov <e.stratonikov@yadro.com>
2023-06-02 10:56:24 +03:00

361 lines
9.2 KiB
Go

// Package hrw implements Rendezvous hashing.
// http://en.wikipedia.org/wiki/Rendezvous_hashing.
package hrw
import (
"encoding/binary"
"errors"
"math"
"reflect"
"sort"
"github.com/twmb/murmur3"
)
type (
// Hasher interface used by SortSliceByValue
Hasher interface{ Hash() uint64 }
sorter struct {
l int
less func(i, j int) bool
swap func(i, j int)
}
hasherSorter[T Hasher, N interface{ ~uint64 | ~float64 }] struct {
slice []T
dist []N
asc bool
}
)
// Boundaries of valid normalized weights
const (
NormalizedMaxWeight = 1.0
NormalizedMinWeight = 0.0
)
func (s *sorter) Len() int { return s.l }
func (s *sorter) Less(i, j int) bool { return s.less(i, j) }
func (s *sorter) Swap(i, j int) { s.swap(i, j) }
func (s *hasherSorter[T, N]) Len() int { return len(s.slice) }
func (s *hasherSorter[T, N]) Less(i, j int) bool {
if s.asc {
return s.dist[i] < s.dist[j]
}
return s.dist[i] > s.dist[j]
}
func (s *hasherSorter[T, N]) Swap(i, j int) {
s.slice[i], s.slice[j] = s.slice[j], s.slice[i]
s.dist[i], s.dist[j] = s.dist[j], s.dist[i]
}
func distance(x uint64, y uint64) uint64 {
acc := x ^ y
// here used mmh3 64 bit finalizer
// https://github.com/aappleby/smhasher/blob/61a0530f28277f2e850bfc39600ce61d02b518de/src/MurmurHash3.cpp#L81
acc ^= acc >> 33
acc = acc * 0xff51afd7ed558ccd
acc ^= acc >> 33
acc = acc * 0xc4ceb9fe1a85ec53
acc ^= acc >> 33
return acc
}
// Hash uses murmur3 hash to return uint64
func Hash(key []byte) uint64 {
return murmur3.Sum64(key)
}
// StringHash uses murmur3 hash to return uint64
func StringHash(key string) uint64 {
return murmur3.StringSum64(key)
}
// Sort receive nodes and hash, and sort it by distance
func Sort(nodes []uint64, hash uint64) []uint64 {
l := len(nodes)
sorted := make([]uint64, l)
dist := make([]uint64, l)
for i := range nodes {
sorted[i] = uint64(i)
dist[i] = distance(nodes[i], hash)
}
sort.Slice(sorted, func(i, j int) bool {
return dist[sorted[i]] < dist[sorted[j]]
})
return sorted
}
// SortByWeight receive nodes, weights and hash, and sort it by distance * weight
func SortByWeight(nodes []uint64, weights []float64, hash uint64) []uint64 {
result := make([]uint64, len(nodes))
copy(nodes, result)
sortByWeight(len(nodes), false, nodes, weights, hash, reflect.Swapper(result))
return result
}
// SortSliceByValue received []T and hash to sort by value-distance
func SortSliceByValue(slice interface{}, hash uint64) {
rule := prepareRule(slice)
if rule != nil {
swap := reflect.Swapper(slice)
sortByDistance(len(rule), false, rule, hash, swap)
}
}
// SortHasherSliceByValue receives []Hasher and hash to sort by value-distance.
func SortHasherSliceByValue[T Hasher](slice []T, hash uint64) {
if len(slice) == 0 {
return
}
dist := make([]uint64, len(slice))
for i := range dist {
dist[i] = distance(slice[i].Hash(), hash)
}
sortHasherByDistance(slice, false, dist)
}
// SortSliceByWeightValue received []T, weights and hash to sort by value-distance * weights
func SortSliceByWeightValue(slice interface{}, weights []float64, hash uint64) {
rule := prepareRule(slice)
if rule != nil {
swap := reflect.Swapper(slice)
sortByWeight(reflect.ValueOf(slice).Len(), false, rule, weights, hash, swap)
}
}
// SortHasherSliceByWeightValue receives []Hasher, weights and hash to sort by value-distance * weights.
func SortHasherSliceByWeightValue[T Hasher](slice []T, weights []float64, hash uint64) {
if len(slice) == 0 {
return
}
if allSameF64(weights) {
dist := make([]uint64, len(slice))
for i := range dist {
dist[i] = distance(slice[i].Hash(), hash)
}
sortHasherByDistance(slice, false, dist)
return
}
dist := make([]float64, len(slice))
for i := range dist {
d := distance(slice[i].Hash(), hash)
// `maxUint64 - distance` makes the shorter distance more valuable
// it is necessary for operation with normalized values
dist[i] = float64(^uint64(0)-d) * weights[i]
}
sort.Sort(&hasherSorter[T, float64]{
slice: slice,
dist: dist,
asc: false,
})
}
// sortHasherByDistance is similar to sortByDistance but accepts slice directly.
func sortHasherByDistance[T Hasher](slice []T, byIndex bool, dist []uint64) {
sort.Sort(&hasherSorter[T, uint64]{
slice: slice,
dist: dist,
asc: true,
})
}
// SortSliceByIndex received []T and hash to sort by index-distance
func SortSliceByIndex(slice interface{}, hash uint64) {
length := reflect.ValueOf(slice).Len()
swap := reflect.Swapper(slice)
sortByDistance(length, true, nil, hash, swap)
}
// SortSliceByWeightIndex received []T, weights and hash to sort by index-distance * weights
func SortSliceByWeightIndex(slice interface{}, weights []float64, hash uint64) {
length := reflect.ValueOf(slice).Len()
swap := reflect.Swapper(slice)
sortByWeight(length, true, nil, weights, hash, swap)
}
func prepareRule(slice interface{}) []uint64 {
t := reflect.TypeOf(slice)
if t.Kind() != reflect.Slice {
panic("HRW sort expects slice, got " + t.Kind().String())
}
var (
val = reflect.ValueOf(slice)
length = val.Len()
rule = make([]uint64, 0, length)
)
if length == 0 {
return nil
}
switch slice := slice.(type) {
case []int:
var key = make([]byte, 16)
for i := 0; i < length; i++ {
binary.BigEndian.PutUint64(key, uint64(slice[i]))
rule = append(rule, Hash(key))
}
case []uint:
var key = make([]byte, 16)
for i := 0; i < length; i++ {
binary.BigEndian.PutUint64(key, uint64(slice[i]))
rule = append(rule, Hash(key))
}
case []int8:
for i := 0; i < length; i++ {
key := byte(slice[i])
rule = append(rule, Hash([]byte{key}))
}
case []uint8:
for i := 0; i < length; i++ {
key := slice[i]
rule = append(rule, Hash([]byte{key}))
}
case []int16:
var key = make([]byte, 8)
for i := 0; i < length; i++ {
binary.BigEndian.PutUint16(key, uint16(slice[i]))
rule = append(rule, Hash(key))
}
case []uint16:
var key = make([]byte, 8)
for i := 0; i < length; i++ {
binary.BigEndian.PutUint16(key, slice[i])
rule = append(rule, Hash(key))
}
case []int32:
var key = make([]byte, 16)
for i := 0; i < length; i++ {
binary.BigEndian.PutUint32(key, uint32(slice[i]))
rule = append(rule, Hash(key))
}
case []uint32:
var key = make([]byte, 16)
for i := 0; i < length; i++ {
binary.BigEndian.PutUint32(key, slice[i])
rule = append(rule, Hash(key))
}
case []int64:
var key = make([]byte, 32)
for i := 0; i < length; i++ {
binary.BigEndian.PutUint64(key, uint64(slice[i]))
rule = append(rule, Hash(key))
}
case []uint64:
var key = make([]byte, 32)
for i := 0; i < length; i++ {
binary.BigEndian.PutUint64(key, slice[i])
rule = append(rule, Hash(key))
}
case []string:
for i := 0; i < length; i++ {
rule = append(rule, Hash([]byte(slice[i])))
}
default:
if _, ok := val.Index(0).Interface().(Hasher); !ok {
panic("slice elements must implement hrw.Hasher")
}
for i := 0; i < length; i++ {
h := val.Index(i).Interface().(Hasher)
rule = append(rule, h.Hash())
}
}
return rule
}
// ValidateWeights checks if weights are normalized between 0.0 and 1.0
func ValidateWeights(weights []float64) error {
for i := range weights {
if math.IsNaN(weights[i]) || weights[i] > NormalizedMaxWeight || weights[i] < NormalizedMinWeight {
return errors.New("weights are not normalized")
}
}
return nil
}
// sortByWeight sorts nodes by weight using provided swapper.
// nodes contains hrw hashes. If it is nil, indices are used.
func sortByWeight(l int, byIndex bool, nodes []uint64, weights []float64, hash uint64, swap func(i, j int)) {
// if all nodes have the same distance then sort uniformly
if allSameF64(weights) {
sortByDistance(l, byIndex, nodes, hash, swap)
return
}
dist := make([]float64, l)
for i := 0; i < l; i++ {
d := getDistance(byIndex, i, nodes, hash)
// `maxUint64 - distance` makes the shorter distance more valuable
// it is necessary for operation with normalized values
dist[i] = float64(^uint64(0)-d) * weights[i]
}
s := &sorter{
l: l,
swap: func(i, j int) {
swap(i, j)
dist[i], dist[j] = dist[j], dist[i]
},
less: func(i, j int) bool {
return dist[i] > dist[j] // higher distance must be placed lower to be first
},
}
sort.Sort(s)
}
// sortByDistance sorts nodes by hrw distance using provided swapper.
// nodes contains hrw hashes. If it is nil, indices are used.
func sortByDistance(l int, byIndex bool, nodes []uint64, hash uint64, swap func(i, j int)) {
dist := make([]uint64, l)
for i := 0; i < l; i++ {
dist[i] = getDistance(byIndex, i, nodes, hash)
}
s := &sorter{
l: l,
swap: func(i, j int) {
swap(i, j)
dist[i], dist[j] = dist[j], dist[i]
},
less: func(i, j int) bool {
return dist[i] < dist[j]
},
}
sort.Sort(s)
}
// getDistance return distance from nodes[i] to h.
// If byIndex is true, nodes index is used.
// Else if nodes[i] != nil, distance is calculated from this value.
// Otherwise, and hash from node index is taken.
func getDistance(byIndex bool, i int, nodes []uint64, h uint64) uint64 {
if nodes != nil {
return distance(nodes[i], h)
} else if byIndex {
return distance(uint64(i), h)
} else {
buf := make([]byte, 8)
binary.LittleEndian.PutUint64(buf, uint64(i))
return distance(Hash(buf), h)
}
}
func allSameF64(fs []float64) bool {
for i := range fs {
if fs[i] != fs[0] {
return false
}
}
return true
}