Add pool Update #204

Closed
achuprov wants to merge 2 commits from achuprov/frostfs-sdk-go:feat/pool_update into master
2 changed files with 279 additions and 14 deletions
Showing only changes of commit ce038b04df


@@ -1871,9 +1871,19 @@ func (p *Pool) DeleteObject(ctx context.Context, prm PrmObjectDelete) error {
// See also InitParameters.SetClientRebalanceInterval.
func (p *Pool) Dial(ctx context.Context) error {
dstepanov-yadro marked this conversation as resolved Outdated

Hm, does it work? You pass `nil` as `existClients`, but there is no nil check in `p.dial`.

If `existClients` is `nil`, then `ok` will be `false`. https://git.frostfs.info/TrueCloudLab/frostfs-sdk-go/src/commit/d37d31346095c2a9683cc0161546404e1bcb8a99/pool/pool.go#L1893
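For reference, reading from a `nil` map in Go never panics: the comma-ok form returns the zero value and `ok == false`, which is why passing `nil` as `existClients` works. A minimal standalone sketch (names are illustrative):

```go
package main

import "fmt"

func main() {
	var existClients map[string]string // nil, like the map passed from Dial

	// A lookup on a nil map yields the zero value and ok == false; it does not panic.
	c, ok := existClients["peer0"]
	fmt.Printf("%q %v\n", c, ok) // "" false
}
```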
aarifullin marked this conversation as resolved Outdated

Hm, I don't see any reason to create the context with cancellation and store it via `p.cancel = cancel`: there are no functions started in a goroutine here.

You're right. Fixed. Now `p.cancel` is used to stop the `rebalance`.
pool := p.pool.Load()
err := pool.Dial(ctx)
if err := pool.Dial(ctx); err != nil {
return err
}
pool.cancelLock.Lock()
fyrchik marked this conversation as resolved Outdated

If `err != nil`, we are leaking a goroutine here (an error on dial should not require an explicit `Close`).

Already fixed
if pool.cancel != nil {
dkirillov marked this conversation as resolved Outdated

Do we really need to check this (especially without thread-safe acquisition)? We could add a comment that calling this method multiple times leads to undefined behavior and that client code must not operate the pool this way.

fixed
fyrchik marked this conversation as resolved Outdated

Why not just check `if err != nil`? Then `Dial` can ensure that it doesn't start any routines unless `nil` is returned (this was the previous behaviour, I believe).

This comment is no longer relevant: we initiate `rebalance` only if `Dial` does not return an error. https://git.frostfs.info/TrueCloudLab/frostfs-sdk-go/src/commit/9b05e56e52623f8a892e73f804b2cba157d43e7b/pool/pool.go#L1877
dkirillov marked this conversation as resolved Outdated

Can we avoid the empty line above? Probably we can write `if err := pool.Dial(ctx); err != nil {`

fixed
pool.cancel()
}
pool.cancelLock.Unlock()
dkirillov marked this conversation as resolved Outdated

Why do we always start rebalance? It seems we should return the error immediately after `pool.Dial` if an error occurs.

fixed
pool.startRebalance(ctx)
dkirillov marked this conversation as resolved Outdated

Actually we should guard `if pool.cancel != nil {` too

Yes, this check is necessary. On the first invocation of `Dial`, `pool.cancel` will be `nil`. If `Dial` is called again, we can properly stop the rebalance.

I meant that if we guard `pool.cancel()` with a mutex, we must also guard the `pool.cancel != nil` check with the same mutex.

Oh, you're correct. I've corrected it.
return err
return nil
}
// FindSiblingByParentID implements relations.Relations.
@@ -2004,6 +2014,27 @@ func (p *Pool) Statistic() Statistic {
return p.pool.Load().Statistic()
}
// Update is a method that lets you refresh the list of nodes without recreating the pool.
// Use a long-lived context to avoid early rebalance stop.
// Can interrupt an operation being performed on a node that was removed.
// Ensures that:
// 1) Preserved connections would not be closed.
// 2) In the event of an error, the pool remains operational.
func (p *Pool) Update(ctx context.Context, prm []NodeParam) error {
pool := p.pool.Load()
newPool, equal, err := pool.update(ctx, prm)
dkirillov marked this conversation as resolved Outdated

Do we really need to stop rebalancing even if the new pool equals the old one?

fixed
if equal || err != nil {
return err
}
newPool.startRebalance(ctx)
oldPool := p.pool.Swap(newPool)
oldPool.stopRebalance()
return nil
}
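A brief usage sketch of the new `Update` method, written in the in-package style of the tests below (addresses are placeholders; the `{priority, address, weight}` literal order follows those tests):

```go
// Sketch (in-package, illustrative only): refresh the node set of a pool that
// has already been dialed. Per the guarantees above, connections to preserved
// addresses are kept, and on error the existing pool keeps working, so the
// caller may simply log the failure.
func refreshNodes(ctx context.Context, p *Pool) error {
	return p.Update(ctx, []NodeParam{
		{1, "peer-1", 1}, // newly added, highest-priority node
		{2, "peer0", 1},  // preserved node: its client is reused, not re-dialed
	})
}
```

Note the ordering inside `Update`: rebalancing is started on the new pool before it is swapped in, and the old pool's rebalance is stopped only after the swap, so there is no window without a running rebalancer.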
type pool struct {
innerPools []*innerPool
key *ecdsa.PrivateKey
@@ -2073,8 +2104,6 @@ func newPool(options InitParameters) (*pool, error) {
}
func (p *pool) Dial(ctx context.Context) error {
p.stopRebalance()
err := p.dial(ctx, nil)
if err != nil {
return err
@@ -2142,6 +2171,64 @@ func (p *pool) dial(ctx context.Context, existingClients map[string]client) error {
return nil
}
func nodesParamEqual(a, b []*nodesParam) bool {
if len(a) != len(b) {
return false
}
fyrchik marked this conversation as resolved Outdated

No need for comment here, quite obvious from the code.

I don't think it is quite obvious. We need to look into the `dial` method to find out that the `existClients` map is being modified.

The comment says 'remove', but the loop below just closes clients. To me the comment doesn't mention the `existClients` processing in `dial`.

fixed
for i, v := range a {
if v.priority != b[i].priority || len(v.addresses) != len(b[i].addresses) {
return false
fyrchik marked this conversation as resolved Outdated

May we use `multierr` for multiple errors here? Also, it would be nice to use a dedicated error here, to avoid confusion on the caller's side, and to describe it in the function comment:

This function guarantees that:
1. Preserved connections would not be closed.
2. On any error, the old pool will remain functional, so the error may just be logged.

Added `errors.Join` to handle multiple errors
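For reference, `errors.Join` (standard library, Go 1.20+) aggregates several errors into one that still matches each wrapped error via `errors.Is`; a minimal standalone sketch of the accumulate-while-closing pattern (error values are hypothetical):

```go
package main

import (
	"errors"
	"fmt"
)

// Hypothetical close failures, standing in for client.close() errors.
var (
	errCloseA = errors.New("close peer0: connection reset")
	errCloseB = errors.New("close peer1: timeout")
)

func main() {
	var err error
	for _, closeErr := range []error{errCloseA, nil, errCloseB} {
		if closeErr != nil {
			err = errors.Join(err, closeErr) // nil err on the first iteration is fine
		}
	}
	fmt.Println(err)                       // both messages, one per line
	fmt.Println(errors.Is(err, errCloseA)) // true
}
```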
}
for j, address := range v.addresses {
if address != b[i].addresses[j] {
return false
}
}
}
return true
}
// Update requires that no other operations are executed concurrently on the pool instance.
func (p *pool) update(ctx context.Context, prm []NodeParam) (*pool, bool, error) {
dkirillov marked this conversation as resolved Outdated

Comment should have the following format 'Update ...'

fixed
newPool := *p
fyrchik marked this conversation as resolved Outdated

Let's use a space after `//`; this is how all comments in this repo are written.

fixed
dkirillov marked this conversation as resolved Outdated

Why is this method exported?

fixed
existingClients := make(map[string]client)
for i, pool := range newPool.rebalanceParams.nodesParams {
for j := range pool.weights {
existingClients[pool.addresses[j]] = newPool.innerPools[i].clients[j]
}
}
nodesParams, err := adjustNodeParams(prm)
if err != nil {
return nil, false, err
}
if nodesParamEqual(newPool.rebalanceParams.nodesParams, nodesParams) {
return nil, true, err
}
newPool.rebalanceParams.nodesParams = nodesParams
err = newPool.dial(ctx, existingClients)
if err != nil {
return nil, false, err
}
// After newPool.dial(ctx, existingClients), existingClients will contain only outdated clients.
// Removing outdated clients
for _, client := range existingClients {
if clientErr := client.close(); clientErr != nil {
err = errors.Join(err, clientErr)
}
}
return &newPool, false, err
}
func (p *pool) log(level zapcore.Level, msg string, fields ...zap.Field) {
if p.logger == nil {
return
@@ -2195,6 +2282,11 @@ func fillDefaultInitParams(params *InitParameters, cache *sessionCache) {
}
}
type addressWeightPair struct {
address string
weight float64
}
func adjustNodeParams(nodeParams []NodeParam) ([]*nodesParam, error) {
if len(nodeParams) == 0 {
return nil, errors.New("no FrostFS peers configured")
@@ -2221,6 +2313,22 @@ func adjustNodeParams(nodeParams []NodeParam) ([]*nodesParam, error) {
return nodesParams[i].priority < nodesParams[j].priority
})
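// Sort addresses within each priority group, keeping each weight paired with
// its address, so that node lists differing only in order yield identical
// nodesParams (see TestAdjustNodeParams).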
for _, nodes := range nodesParams {
addressWeightPairs := make([]addressWeightPair, len(nodes.addresses))
for i := range nodes.addresses {
addressWeightPairs[i] = addressWeightPair{address: nodes.addresses[i], weight: nodes.weights[i]}
}
sort.Slice(addressWeightPairs, func(i, j int) bool {
return addressWeightPairs[i].address < addressWeightPairs[j].address
})
for i, pair := range addressWeightPairs {
nodes.addresses[i] = pair.address
nodes.weights[i] = pair.weight
}
}
fyrchik marked this conversation as resolved Outdated

Can `p.cancel` ever be nil?

fixed
return nodesParams, nil
}
@@ -2228,12 +2336,6 @@ func (p *pool) startRebalance(ctx context.Context) {
p.cancelLock.Lock()
defer p.cancelLock.Unlock()
// stop rebalance
if p.cancel != nil {
p.cancel()
<-p.closedCh
}
rebalanceCtx, cancel := context.WithCancel(ctx)
p.closedCh = make(chan struct{})
@@ -2266,10 +2368,8 @@ func (p *pool) stopRebalance() {
p.cancelLock.Lock()
defer p.cancelLock.Unlock()
if p.cancel != nil {
p.cancel()
<-p.closedCh
}
p.cancel()
<-p.closedCh
}
func (p *pool) updateNodesHealth(ctx context.Context, buffers [][]float64) {


@@ -4,7 +4,9 @@ import (
"context"
"crypto/ecdsa"
"errors"
"reflect"
"strconv"
"sync"
"testing"
"time"
@@ -174,6 +176,150 @@ func TestTwoNodes(t *testing.T) {
require.True(t, assertAuthKeyForAny(st, clientKeys))
}
func TestTwoNodesUpdate(t *testing.T) {
var clientKeys []*ecdsa.PrivateKey
mockClientBuilder := func(addr string) client {
key := newPrivateKey(t)
clientKeys = append(clientKeys, key)
return newMockClient(addr, *key)
}
opts := InitParameters{
key: newPrivateKey(t),
nodeParams: []NodeParam{
{2, "peer0", 1},
{2, "peer1", 1},
},
}
opts.setClientBuilder(mockClientBuilder)
pool, err := NewPool(opts)
require.NoError(t, err)
err = pool.Dial(context.Background())
require.NoError(t, err)
t.Cleanup(pool.Close)
cp, err := pool.pool.Load().connection()
require.NoError(t, err)
st, _ := pool.pool.Load().cache.Get(formCacheKey(cp.address(), pool.pool.Load().key, false))
require.True(t, assertAuthKeyForAny(st, clientKeys))
pool.Update(context.Background(), []NodeParam{
{1, "peer-1", 1},
{2, "peer0", 1},
{2, "peer1", 1},
})
st1, _ := pool.pool.Load().cache.Get(formCacheKey(cp.address(), pool.pool.Load().key, false))
require.Equal(t, &st1, &st)
cp2, err := pool.pool.Load().connection()
require.NoError(t, err)
require.Equal(t, cp2.address(), "peer-1")
}
func TestUpdateNodeMultithread(t *testing.T) {
key1 := newPrivateKey(t)
mockClientBuilder := func(addr string) client {
return newMockClient(addr, *key1)
}
opts := InitParameters{
key: newPrivateKey(t),
nodeParams: []NodeParam{{1, "peer0", 1}},
}
opts.setClientBuilder(mockClientBuilder)
pool, err := NewPool(opts)
require.NoError(t, err)
err = pool.Dial(context.Background())
require.NoError(t, err)
t.Cleanup(pool.Close)
wg := sync.WaitGroup{}
for i := 0; i < 10; i++ {
wg.Add(1)
go func(i int) {
defer wg.Done()
err := pool.Update(context.Background(), []NodeParam{{1, "peer" + strconv.Itoa(i+1), 1}})
require.NoError(t, err)
}(i)
}
wg.Wait()
}
func TestUpdateNodeEqualConfig(t *testing.T) {
key1 := newPrivateKey(t)
mockClientBuilder := func(addr string) client {
return newMockClient(addr, *key1)
}
opts := InitParameters{
key: newPrivateKey(t),
nodeParams: []NodeParam{{1, "peer0", 1}},
}
opts.setClientBuilder(mockClientBuilder)
pool, err := NewPool(opts)
require.NoError(t, err)
err = pool.Dial(context.Background())
require.NoError(t, err)
t.Cleanup(pool.Close)
cp, err := pool.pool.Load().connection()
require.NoError(t, err)
st, _ := pool.pool.Load().cache.Get(formCacheKey(cp.address(), pool.pool.Load().key, false))
expectedAuthKey := frostfsecdsa.PublicKey(key1.PublicKey)
require.True(t, st.AssertAuthKey(&expectedAuthKey))
_, flag, err := pool.pool.Load().update(context.Background(), []NodeParam{{1, "peer0", 1}})
require.NoError(t, err)
require.True(t, flag)
}
func TestUpdateNode(t *testing.T) {
key1 := newPrivateKey(t)
mockClientBuilder := func(addr string) client {
return newMockClient(addr, *key1)
}
opts := InitParameters{
key: newPrivateKey(t),
nodeParams: []NodeParam{{1, "peer0", 1}},
}
opts.setClientBuilder(mockClientBuilder)
pool, err := NewPool(opts)
require.NoError(t, err)
err = pool.Dial(context.Background())
require.NoError(t, err)
t.Cleanup(pool.Close)
cp, err := pool.pool.Load().connection()
require.NoError(t, err)
st, _ := pool.pool.Load().cache.Get(formCacheKey(cp.address(), pool.pool.Load().key, false))
expectedAuthKey := frostfsecdsa.PublicKey(key1.PublicKey)
require.True(t, st.AssertAuthKey(&expectedAuthKey))
pool.Update(context.Background(), []NodeParam{{1, "peer0", 1}})
cp1, err := pool.pool.Load().connection()
st1, _ := pool.pool.Load().cache.Get(formCacheKey(cp1.address(), pool.pool.Load().key, false))
require.NoError(t, err)
require.Equal(t, &st, &st1)
require.Equal(t, &cp, &cp1)
pool.Update(context.Background(), []NodeParam{{1, "peer1", 1}})
cp2, err := pool.pool.Load().connection()
require.NoError(t, err)
st2, _ := pool.pool.Load().cache.Get(formCacheKey(cp2.address(), pool.pool.Load().key, false))
require.NotEqual(t, cp.address(), cp2.address())
require.NotEqual(t, &st, &st2)
}
func assertAuthKeyForAny(st session.Object, clientKeys []*ecdsa.PrivateKey) bool {
for _, key := range clientKeys {
expectedAuthKey := frostfsecdsa.PublicKey(key.PublicKey)
@@ -668,6 +814,25 @@ func TestHandleError(t *testing.T) {
}
}
func TestAdjustNodeParams(t *testing.T) {
nodes1 := []NodeParam{
{1, "peer0", 1},
{1, "peer1", 2},
{1, "peer2", 3},
{2, "peer21", 2},
}
nodes2 := []NodeParam{
{1, "peer0", 1},
{1, "peer2", 3},
{1, "peer1", 2},
{2, "peer21", 2},
}
nodesParam1, _ := adjustNodeParams(nodes1)
nodesParam2, _ := adjustNodeParams(nodes2)
require.True(t, reflect.DeepEqual(nodesParam1, nodesParam2))
}
func TestSwitchAfterErrorThreshold(t *testing.T) {
nodes := []NodeParam{
{1, "peer0", 1},