[#947] engine: Evacuate trees to remote nodes
Signed-off-by: Dmitrii Stepanov <d.stepanov@yadro.com>
This commit is contained in:
parent
728150d1d2
commit
db67c21d55
8 changed files with 283 additions and 95 deletions
|
@ -71,61 +71,67 @@ func (s *Service) replicationWorker(ctx context.Context) {
|
|||
case <-s.closeCh:
|
||||
return
|
||||
case task := <-s.replicationTasks:
|
||||
ctx, span := tracing.StartSpanFromContext(ctx, "TreeService.HandleReplicationTask",
|
||||
trace.WithAttributes(
|
||||
attribute.String("public_key", hex.EncodeToString(task.n.PublicKey())),
|
||||
),
|
||||
)
|
||||
start := time.Now()
|
||||
|
||||
var lastErr error
|
||||
var lastAddr string
|
||||
|
||||
task.n.IterateNetworkEndpoints(func(addr string) bool {
|
||||
ctx, span := tracing.StartSpanFromContext(ctx, "TreeService.HandleReplicationTaskOnEndpoint",
|
||||
trace.WithAttributes(
|
||||
attribute.String("public_key", hex.EncodeToString(task.n.PublicKey())),
|
||||
attribute.String("address", addr),
|
||||
),
|
||||
)
|
||||
defer span.End()
|
||||
|
||||
lastAddr = addr
|
||||
|
||||
c, err := s.cache.get(ctx, addr)
|
||||
if err != nil {
|
||||
lastErr = fmt.Errorf("can't create client: %w", err)
|
||||
return false
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(ctx, s.replicatorTimeout)
|
||||
_, lastErr = c.Apply(ctx, task.req)
|
||||
cancel()
|
||||
|
||||
return lastErr == nil
|
||||
})
|
||||
|
||||
if lastErr != nil {
|
||||
if errors.Is(lastErr, errRecentlyFailed) {
|
||||
s.log.Debug(logs.TreeDoNotSendUpdateToTheNode,
|
||||
zap.String("last_error", lastErr.Error()),
|
||||
zap.String("trace_id", tracingPkg.GetTraceID(ctx)))
|
||||
} else {
|
||||
s.log.Warn(logs.TreeFailedToSentUpdateToTheNode,
|
||||
zap.String("last_error", lastErr.Error()),
|
||||
zap.String("address", lastAddr),
|
||||
zap.String("key", hex.EncodeToString(task.n.PublicKey())),
|
||||
zap.String("trace_id", tracingPkg.GetTraceID(ctx)))
|
||||
}
|
||||
s.metrics.AddReplicateTaskDuration(time.Since(start), false)
|
||||
} else {
|
||||
s.metrics.AddReplicateTaskDuration(time.Since(start), true)
|
||||
}
|
||||
span.End()
|
||||
_ = s.ReplicateTreeOp(ctx, task.n, task.req)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (s *Service) ReplicateTreeOp(ctx context.Context, n netmapSDK.NodeInfo, req *ApplyRequest) error {
|
||||
ctx, span := tracing.StartSpanFromContext(ctx, "TreeService.HandleReplicationTask",
|
||||
trace.WithAttributes(
|
||||
attribute.String("public_key", hex.EncodeToString(n.PublicKey())),
|
||||
),
|
||||
)
|
||||
defer span.End()
|
||||
|
||||
start := time.Now()
|
||||
|
||||
var lastErr error
|
||||
var lastAddr string
|
||||
|
||||
n.IterateNetworkEndpoints(func(addr string) bool {
|
||||
ctx, span := tracing.StartSpanFromContext(ctx, "TreeService.HandleReplicationTaskOnEndpoint",
|
||||
trace.WithAttributes(
|
||||
attribute.String("public_key", hex.EncodeToString(n.PublicKey())),
|
||||
attribute.String("address", addr),
|
||||
),
|
||||
)
|
||||
defer span.End()
|
||||
|
||||
lastAddr = addr
|
||||
|
||||
c, err := s.cache.get(ctx, addr)
|
||||
if err != nil {
|
||||
lastErr = fmt.Errorf("can't create client: %w", err)
|
||||
return false
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(ctx, s.replicatorTimeout)
|
||||
_, lastErr = c.Apply(ctx, req)
|
||||
cancel()
|
||||
|
||||
return lastErr == nil
|
||||
})
|
||||
|
||||
if lastErr != nil {
|
||||
if errors.Is(lastErr, errRecentlyFailed) {
|
||||
s.log.Debug(logs.TreeDoNotSendUpdateToTheNode,
|
||||
zap.String("last_error", lastErr.Error()),
|
||||
zap.String("trace_id", tracingPkg.GetTraceID(ctx)))
|
||||
} else {
|
||||
s.log.Warn(logs.TreeFailedToSentUpdateToTheNode,
|
||||
zap.String("last_error", lastErr.Error()),
|
||||
zap.String("address", lastAddr),
|
||||
zap.String("key", hex.EncodeToString(n.PublicKey())),
|
||||
zap.String("trace_id", tracingPkg.GetTraceID(ctx)))
|
||||
}
|
||||
s.metrics.AddReplicateTaskDuration(time.Since(start), false)
|
||||
return lastErr
|
||||
}
|
||||
s.metrics.AddReplicateTaskDuration(time.Since(start), true)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *Service) replicateLoop(ctx context.Context) {
|
||||
for i := 0; i < s.replicatorWorkerCount; i++ {
|
||||
go s.replicationWorker(ctx)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue