forked from TrueCloudLab/frostfs-node
[#1427] services/tree: Parallelize replicator
Before this commit the replication channel quickly filled up under heavy load, which led to continuously increasing latency for all write operations. Now replication is fanned out to a pool of workers and the latency no longer grows.

Signed-off-by: Evgenii Stratonikov <evgeniy@nspcc.ru>
parent 8027b7bb6b
commit 33d8fb187a
3 changed files with 78 additions and 40 deletions
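The fix has the standard bounded worker-pool shape: write operations enqueue replication tasks into a buffered channel, and a fixed number of goroutines drain that channel concurrently. Below is a minimal, self-contained Go sketch of the pattern only; it is not the repository code, the names (task, send, queueCapacity) are illustrative, and the constants merely mirror the values added in the diff that follows.

```go
// Standalone sketch of the pattern this commit introduces; NOT the repository
// code. A producer enqueues tasks into a buffered channel and a fixed pool of
// workers drains it concurrently, so one slow peer no longer stalls the rest.
package main

import (
	"context"
	"fmt"
	"sync"
	"time"
)

const (
	queueCapacity = 64              // mirrors defaultReplicatorCapacity in the diff
	workerCount   = 64              // mirrors defaultReplicatorWorkerCount
	sendTimeout   = 5 * time.Second // mirrors defaultReplicatorSendTimeout
)

type task struct{ id int }

// send simulates delivering one replication request with a per-attempt timeout,
// the way each worker below wraps its Apply call in context.WithTimeout.
func send(ctx context.Context) error {
	ctx, cancel := context.WithTimeout(ctx, sendTimeout)
	defer cancel()
	select {
	case <-time.After(10 * time.Millisecond): // pretend network round-trip
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

func main() {
	tasks := make(chan task, queueCapacity)
	var wg sync.WaitGroup

	// Fixed pool of workers draining the queue concurrently.
	for i := 0; i < workerCount; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for t := range tasks {
				if err := send(context.Background()); err != nil {
					fmt.Println("task", t.id, "failed:", err)
				}
			}
		}()
	}

	// Producer side: enqueueing returns quickly because the pool keeps the
	// buffered channel from filling up.
	for i := 0; i < 256; i++ {
		tasks <- task{id: i}
	}
	close(tasks)
	wg.Wait()
}
```

With a single consumer, one slow peer stalls the whole queue and the producer blocks as soon as the buffer fills, which is the latency growth described in the commit message; with N workers the queue is drained in parallel.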
@@ -21,22 +21,70 @@ type movePair struct {
     op     *pilorama.LogMove
 }
 
+type replicationTask struct {
+    n   netmapSDK.NodeInfo
+    req *ApplyRequest
+}
+
 const (
-    defaultReplicatorCapacity = 64
-    defaultReplicatorTimeout  = time.Second * 2
+    defaultReplicatorCapacity    = 64
+    defaultReplicatorWorkerCount = 64
+    defaultReplicatorSendTimeout = time.Second * 5
 )
 
-func (s *Service) replicateLoop(ctx context.Context) {
+func (s *Service) replicationWorker() {
     for {
         select {
         case <-s.closeCh:
             return
+        case task := <-s.replicationTasks:
+            var lastErr error
+            var lastAddr string
+
+            task.n.IterateNetworkEndpoints(func(addr string) bool {
+                lastAddr = addr
+
+                c, err := s.cache.get(context.Background(), addr)
+                if err != nil {
+                    lastErr = fmt.Errorf("can't create client: %w", err)
+                    return false
+                }
+
+                ctx, cancel := context.WithTimeout(context.Background(), defaultReplicatorSendTimeout)
+                _, lastErr = c.Apply(ctx, task.req)
+                cancel()
+
+                return lastErr == nil
+            })
+
+            if lastErr != nil {
+                s.log.Warn("failed to sent update to the node",
+                    zap.String("last_error", lastErr.Error()),
+                    zap.String("address", lastAddr),
+                    zap.String("key", hex.EncodeToString(task.n.PublicKey())))
+            }
+        }
+    }
+}
+
+func (s *Service) replicateLoop(ctx context.Context) {
+    for i := 0; i < defaultReplicatorWorkerCount; i++ {
+        go s.replicationWorker()
+    }
+    defer func() {
+        for len(s.replicationTasks) != 0 {
+            <-s.replicationTasks
+        }
+    }()
+
+    for {
+        select {
+        case <-s.closeCh:
+            return
         case <-ctx.Done():
             return
         case op := <-s.replicateCh:
-            ctx, cancel := context.WithTimeout(ctx, defaultReplicatorTimeout)
-            err := s.replicate(ctx, op)
-            cancel()
+            err := s.replicate(op)
             if err != nil {
                 s.log.Error("error during replication",
                     zap.String("err", err.Error()),
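Two details of the hunk above are easy to miss. First, the timeout granularity changes: the old code wrapped the whole replicate call in a single 2-second defaultReplicatorTimeout, while the new code gives each Apply attempt inside a worker its own 5-second defaultReplicatorSendTimeout, so one slow endpoint no longer consumes the budget of every other node. Second, the deferred loop in the new replicateLoop drains replicationTasks once the service is shutting down, which appears intended to unblock any replicate call still waiting to enqueue a task (see the next hunks).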
@@ -47,7 +95,7 @@ func (s *Service) replicateLoop(ctx context.Context) {
     }
 }
 
-func (s *Service) replicate(ctx context.Context, op movePair) error {
+func (s *Service) replicate(op movePair) error {
     req := newApplyRequest(&op)
     err := signMessage(req, s.key)
     if err != nil {
@@ -64,37 +112,19 @@ func (s *Service) replicate(ctx context.Context, op movePair) error {
             continue
         }
 
-        var lastErr error
-        var lastAddr string
-
-        n.IterateNetworkEndpoints(func(addr string) bool {
-            lastAddr = addr
-
-            c, err := s.cache.get(ctx, addr)
-            if err != nil {
-                lastErr = err
-                return false
-            }
-
-            _, lastErr = c.Apply(ctx, req)
-            return lastErr == nil
-        })
-
-        if lastErr != nil {
-            s.log.Warn("failed to sent update to the node",
-                zap.String("last_error", lastErr.Error()),
-                zap.String("address", lastAddr),
-                zap.String("key", hex.EncodeToString(n.PublicKey())))
-        }
+        s.replicationTasks <- replicationTask{n, req}
     }
 
     return nil
 }
 
 func (s *Service) pushToQueue(cid cidSDK.ID, treeID string, op *pilorama.LogMove) {
-    s.replicateCh <- movePair{
+    select {
+    case s.replicateCh <- movePair{
         cid:    cid,
         treeID: treeID,
         op:     op,
+    }:
+    case <-s.closeCh:
     }
 }
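The pushToQueue change in the last hunk is the usual way to make a producer shutdown-aware: instead of an unconditional send that could block forever once nothing reads replicateCh anymore, the send races against the close signal. A small self-contained sketch of that pattern follows; the names (job, enqueue) are illustrative, not from the repository.

```go
// Sketch of the shutdown-aware enqueue used by pushToQueue; illustrative only.
package main

import "fmt"

type job struct{ id int }

// enqueue mirrors the select in pushToQueue: the send only happens while the
// service is running; once closeCh is closed a blocked producer gives up
// instead of hanging forever on a full queue.
func enqueue(queue chan<- job, closeCh <-chan struct{}, j job) bool {
	select {
	case queue <- j:
		return true
	case <-closeCh:
		return false
	}
}

func main() {
	queue := make(chan job, 1)
	closeCh := make(chan struct{})

	fmt.Println(enqueue(queue, closeCh, job{id: 1})) // true: the buffer has room

	close(closeCh)
	// The buffer is now full and the service is "closed": the unconditional
	// send of the old code would block forever here; the select returns false.
	fmt.Println(enqueue(queue, closeCh, job{id: 2}))
}
```

The trade-off is that an operation arriving during shutdown is simply dropped, which the commit evidently accepts for a node that is going away anyway.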