[#947] engine: Evacuate trees to remote nodes

Signed-off-by: Dmitrii Stepanov <d.stepanov@yadro.com>
This commit is contained in:
Dmitrii Stepanov 2024-02-06 17:34:32 +03:00
parent 728150d1d2
commit db67c21d55
8 changed files with 283 additions and 95 deletions

View file

@ -71,61 +71,67 @@ func (s *Service) replicationWorker(ctx context.Context) {
case <-s.closeCh:
return
case task := <-s.replicationTasks:
ctx, span := tracing.StartSpanFromContext(ctx, "TreeService.HandleReplicationTask",
trace.WithAttributes(
attribute.String("public_key", hex.EncodeToString(task.n.PublicKey())),
),
)
start := time.Now()
var lastErr error
var lastAddr string
task.n.IterateNetworkEndpoints(func(addr string) bool {
ctx, span := tracing.StartSpanFromContext(ctx, "TreeService.HandleReplicationTaskOnEndpoint",
trace.WithAttributes(
attribute.String("public_key", hex.EncodeToString(task.n.PublicKey())),
attribute.String("address", addr),
),
)
defer span.End()
lastAddr = addr
c, err := s.cache.get(ctx, addr)
if err != nil {
lastErr = fmt.Errorf("can't create client: %w", err)
return false
}
ctx, cancel := context.WithTimeout(ctx, s.replicatorTimeout)
_, lastErr = c.Apply(ctx, task.req)
cancel()
return lastErr == nil
})
if lastErr != nil {
if errors.Is(lastErr, errRecentlyFailed) {
s.log.Debug(logs.TreeDoNotSendUpdateToTheNode,
zap.String("last_error", lastErr.Error()),
zap.String("trace_id", tracingPkg.GetTraceID(ctx)))
} else {
s.log.Warn(logs.TreeFailedToSentUpdateToTheNode,
zap.String("last_error", lastErr.Error()),
zap.String("address", lastAddr),
zap.String("key", hex.EncodeToString(task.n.PublicKey())),
zap.String("trace_id", tracingPkg.GetTraceID(ctx)))
}
s.metrics.AddReplicateTaskDuration(time.Since(start), false)
} else {
s.metrics.AddReplicateTaskDuration(time.Since(start), true)
}
span.End()
_ = s.ReplicateTreeOp(ctx, task.n, task.req)
}
}
}
func (s *Service) ReplicateTreeOp(ctx context.Context, n netmapSDK.NodeInfo, req *ApplyRequest) error {
ctx, span := tracing.StartSpanFromContext(ctx, "TreeService.HandleReplicationTask",
trace.WithAttributes(
attribute.String("public_key", hex.EncodeToString(n.PublicKey())),
),
)
defer span.End()
start := time.Now()
var lastErr error
var lastAddr string
n.IterateNetworkEndpoints(func(addr string) bool {
ctx, span := tracing.StartSpanFromContext(ctx, "TreeService.HandleReplicationTaskOnEndpoint",
trace.WithAttributes(
attribute.String("public_key", hex.EncodeToString(n.PublicKey())),
attribute.String("address", addr),
),
)
defer span.End()
lastAddr = addr
c, err := s.cache.get(ctx, addr)
if err != nil {
lastErr = fmt.Errorf("can't create client: %w", err)
return false
}
ctx, cancel := context.WithTimeout(ctx, s.replicatorTimeout)
_, lastErr = c.Apply(ctx, req)
cancel()
return lastErr == nil
})
if lastErr != nil {
if errors.Is(lastErr, errRecentlyFailed) {
s.log.Debug(logs.TreeDoNotSendUpdateToTheNode,
zap.String("last_error", lastErr.Error()),
zap.String("trace_id", tracingPkg.GetTraceID(ctx)))
} else {
s.log.Warn(logs.TreeFailedToSentUpdateToTheNode,
zap.String("last_error", lastErr.Error()),
zap.String("address", lastAddr),
zap.String("key", hex.EncodeToString(n.PublicKey())),
zap.String("trace_id", tracingPkg.GetTraceID(ctx)))
}
s.metrics.AddReplicateTaskDuration(time.Since(start), false)
return lastErr
}
s.metrics.AddReplicateTaskDuration(time.Since(start), true)
return nil
}
func (s *Service) replicateLoop(ctx context.Context) {
for i := 0; i < s.replicatorWorkerCount; i++ {
go s.replicationWorker(ctx)