forked from TrueCloudLab/frostfs-node
[#1427] services/tree: Parallelize replicator
Before this commit, the replication channel filled up quickly under heavy load, which led to continuously increasing latency for all write operations. Replication is now spread across a pool of worker goroutines, so the queue drains instead of backing up.
Signed-off-by: Evgenii Stratonikov <evgeniy@nspcc.ru>
commit 33d8fb187a (parent 8027b7bb6b)
3 changed files with 78 additions and 40 deletions
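For orientation before the diff: the change replaces a single replication loop with a bounded worker pool draining a buffered task channel. Below is a minimal, self-contained sketch of that pattern; every identifier in it (task, tasks, workerCount, closeCh) is an illustrative stand-in, not code from this repository.

package main

import (
	"fmt"
	"sync"
	"time"
)

type task struct{ id int }

func main() {
	const workerCount = 4 // the diff uses defaultReplicatorWorkerCount = 64

	tasks := make(chan task, workerCount) // bounded queue, analogous to replicationTasks
	closeCh := make(chan struct{})

	var wg sync.WaitGroup
	for i := 0; i < workerCount; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for {
				select {
				case <-closeCh:
					return
				case t := <-tasks:
					time.Sleep(10 * time.Millisecond) // stand-in for a bounded-latency RPC
					fmt.Println("replicated", t.id)
				}
			}
		}()
	}

	for i := 0; i < 10; i++ {
		// Producers enqueue through a select so they cannot block forever
		// once closeCh has been closed.
		select {
		case tasks <- task{id: i}:
		case <-closeCh:
		}
	}

	close(closeCh)
	wg.Wait()
}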
@@ -3,6 +3,7 @@ package tree
 import (
     "context"
     "sync"
+    "time"
 
     "github.com/hashicorp/golang-lru/simplelru"
     "github.com/nspcc-dev/neofs-node/pkg/network"
@@ -15,7 +16,10 @@ type clientCache struct {
     simplelru.LRU
 }
 
-const defaultClientCacheSize = 10
+const (
+    defaultClientCacheSize      = 10
+    defaultClientConnectTimeout = time.Second * 2
+)
 
 func (c *clientCache) init() {
     l, _ := simplelru.NewLRU(defaultClientCacheSize, func(key, value interface{}) {
@@ -55,9 +59,11 @@ func dialTreeService(ctx context.Context, netmapAddr string) (*grpc.ClientConn,
         return nil, err
     }
 
-    cc, err := grpc.DialContext(ctx, netAddr.URIAddr(), grpc.WithInsecure())
-    if err != nil {
-        return nil, err
-    }
-    return cc, nil
+    ctx, cancel := context.WithTimeout(ctx, defaultClientConnectTimeout)
+    cc, err := grpc.DialContext(ctx, netAddr.URIAddr(),
+        grpc.WithInsecure(),
+        grpc.WithBlock())
+    cancel()
+
+    return cc, err
 }
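The last hunk above bounds connection time by wrapping a blocking dial in a context timeout. A standalone sketch of the same technique follows; the address, the two-second timeout, and the helper name dial are placeholders, not values from the codebase.

package main

import (
	"context"
	"log"
	"time"

	"google.golang.org/grpc"
)

// dial connects to addr, failing if the connection cannot be
// established within two seconds.
func dial(addr string) (*grpc.ClientConn, error) {
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()

	// grpc.WithBlock makes DialContext wait until the connection is ready
	// (or ctx expires) instead of connecting lazily in the background.
	return grpc.DialContext(ctx, addr, grpc.WithInsecure(), grpc.WithBlock())
}

func main() {
	cc, err := dial("127.0.0.1:8080")
	if err != nil {
		log.Fatal(err)
	}
	defer cc.Close()
}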
@@ -21,22 +21,70 @@ type movePair struct {
     op     *pilorama.LogMove
 }
 
+type replicationTask struct {
+    n   netmapSDK.NodeInfo
+    req *ApplyRequest
+}
+
 const (
-    defaultReplicatorCapacity = 64
-    defaultReplicatorTimeout  = time.Second * 2
+    defaultReplicatorCapacity    = 64
+    defaultReplicatorWorkerCount = 64
+    defaultReplicatorSendTimeout = time.Second * 5
 )
 
-func (s *Service) replicateLoop(ctx context.Context) {
+func (s *Service) replicationWorker() {
     for {
         select {
         case <-s.closeCh:
+            return
+        case task := <-s.replicationTasks:
+            var lastErr error
+            var lastAddr string
+
+            task.n.IterateNetworkEndpoints(func(addr string) bool {
+                lastAddr = addr
+
+                c, err := s.cache.get(context.Background(), addr)
+                if err != nil {
+                    lastErr = fmt.Errorf("can't create client: %w", err)
+                    return false
+                }
+
+                ctx, cancel := context.WithTimeout(context.Background(), defaultReplicatorSendTimeout)
+                _, lastErr = c.Apply(ctx, task.req)
+                cancel()
+
+                return lastErr == nil
+            })
+
+            if lastErr != nil {
+                s.log.Warn("failed to sent update to the node",
+                    zap.String("last_error", lastErr.Error()),
+                    zap.String("address", lastAddr),
+                    zap.String("key", hex.EncodeToString(task.n.PublicKey())))
+            }
+        }
+    }
+}
+
+func (s *Service) replicateLoop(ctx context.Context) {
+    for i := 0; i < defaultReplicatorWorkerCount; i++ {
+        go s.replicationWorker()
+    }
+    defer func() {
+        for len(s.replicationTasks) != 0 {
+            <-s.replicationTasks
+        }
+    }()
+
+    for {
+        select {
+        case <-s.closeCh:
+            return
         case <-ctx.Done():
             return
         case op := <-s.replicateCh:
-            ctx, cancel := context.WithTimeout(ctx, defaultReplicatorTimeout)
-            err := s.replicate(ctx, op)
-            cancel()
+            err := s.replicate(op)
             if err != nil {
                 s.log.Error("error during replication",
                     zap.String("err", err.Error()),
@@ -47,7 +95,7 @@ func (s *Service) replicateLoop(ctx context.Context) {
     }
 }
 
-func (s *Service) replicate(ctx context.Context, op movePair) error {
+func (s *Service) replicate(op movePair) error {
     req := newApplyRequest(&op)
     err := signMessage(req, s.key)
     if err != nil {
@@ -64,37 +112,19 @@ func (s *Service) replicate(ctx context.Context, op movePair) error {
             continue
         }
 
-        var lastErr error
-        var lastAddr string
-
-        n.IterateNetworkEndpoints(func(addr string) bool {
-            lastAddr = addr
-
-            c, err := s.cache.get(ctx, addr)
-            if err != nil {
-                lastErr = err
-                return false
-            }
-
-            _, lastErr = c.Apply(ctx, req)
-            return lastErr == nil
-        })
-
-        if lastErr != nil {
-            s.log.Warn("failed to sent update to the node",
-                zap.String("last_error", lastErr.Error()),
-                zap.String("address", lastAddr),
-                zap.String("key", hex.EncodeToString(n.PublicKey())))
-        }
+        s.replicationTasks <- replicationTask{n, req}
     }
     return nil
 }
 
 func (s *Service) pushToQueue(cid cidSDK.ID, treeID string, op *pilorama.LogMove) {
-    s.replicateCh <- movePair{
-        cid:    cid,
-        treeID: treeID,
-        op:     op,
+    select {
+    case s.replicateCh <- movePair{
+        cid:    cid,
+        treeID: treeID,
+        op:     op,
+    }:
+    case <-s.closeCh:
     }
 }
 
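Two idioms from the hunks above, shown in isolation: a select-guarded send, so pushToQueue cannot leave a producer blocked once shutdown starts, and a drain loop that discards whatever is still queued, as the deferred loop in replicateLoop does. The names queue and enqueue in this sketch are illustrative only.

package main

import "fmt"

func main() {
	queue := make(chan int, 4)
	closeCh := make(chan struct{})

	// enqueue never blocks forever: once closeCh is closed and the queue is
	// full, the value is simply dropped.
	enqueue := func(v int) {
		select {
		case queue <- v:
		case <-closeCh:
		}
	}

	for i := 0; i < 4; i++ { // fill the buffer
		enqueue(i)
	}

	close(closeCh)
	enqueue(99) // queue is full, so the closeCh case wins and 99 is dropped

	// Drain leftovers so nothing stays queued after shutdown.
	for len(queue) != 0 {
		fmt.Println("dropping", <-queue)
	}
}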
@@ -18,9 +18,10 @@ import (
 type Service struct {
     cfg
 
-    cache       clientCache
-    replicateCh chan movePair
-    closeCh     chan struct{}
+    cache            clientCache
+    replicateCh      chan movePair
+    replicationTasks chan replicationTask
+    closeCh          chan struct{}
 }
 
 // MaxGetSubTreeDepth represents maximum allowed traversal depth in GetSubTree RPC.
@@ -42,6 +43,7 @@ func New(opts ...Option) *Service {
     s.cache.init()
     s.closeCh = make(chan struct{})
     s.replicateCh = make(chan movePair, defaultReplicatorCapacity)
+    s.replicationTasks = make(chan replicationTask, defaultReplicatorWorkerCount)
 
     return &s
 }