diff --git a/internal/logs/logs.go b/internal/logs/logs.go index e8472357c..a72609b41 100644 --- a/internal/logs/logs.go +++ b/internal/logs/logs.go @@ -64,6 +64,9 @@ const ( TreeCouldNotUpdateLastSynchronizedHeightForATree = "could not update last synchronized height for a tree" TreeSynchronizeTree = "synchronize tree" TreeFailedToRunTreeSynchronizationOverAllNodes = "failed to run tree synchronization over all nodes" + TreeFailedToRunTreeSynchronizationForSpecificNode = "failed to run tree synchronization for specific node" + TreeFailedToParseAddressForTreeSynchronization = "failed to parse address for tree synchronization" + TreeFailedToConnectForTreeSynchronization = "failed to connect for tree synchronization" TreeSyncingTrees = "syncing trees..." TreeCouldNotFetchContainers = "could not fetch containers" TreeTreesHaveBeenSynchronized = "trees have been synchronized" diff --git a/pkg/services/tree/sync.go b/pkg/services/tree/sync.go index f5a7fbce6..485278adf 100644 --- a/pkg/services/tree/sync.go +++ b/pkg/services/tree/sync.go @@ -9,6 +9,7 @@ import ( "math" "math/rand" "sync" + "sync/atomic" "time" "git.frostfs.info/TrueCloudLab/frostfs-node/internal/logs" @@ -216,45 +217,44 @@ func (s *Service) applyOperationStream(ctx context.Context, cid cid.ID, treeID s } func (s *Service) startStream(ctx context.Context, cid cid.ID, treeID string, - height uint64, treeClient TreeServiceClient, opsCh chan<- *pilorama.Move, -) (uint64, error) { + height uint64, cc *grpc.ClientConn, opsCh chan<- *pilorama.Move, +) error { + treeClient := NewTreeServiceClient(cc) + rawCID := make([]byte, sha256.Size) cid.Encode(rawCID) - for { - newHeight := height - req := &GetOpLogRequest{ - Body: &GetOpLogRequest_Body{ - ContainerId: rawCID, - TreeId: treeID, - Height: newHeight, - }, - } - if err := SignMessage(req, s.key); err != nil { - return 0, err - } - - c, err := treeClient.GetOpLog(ctx, req) - if err != nil { - return 0, fmt.Errorf("can't initialize client: %w", err) - } - res, err := c.Recv() - for ; err == nil; res, err = c.Recv() { - lm := res.GetBody().GetOperation() - m := &pilorama.Move{ - Parent: lm.ParentId, - Child: lm.ChildId, - } - if err := m.Meta.FromBytes(lm.Meta); err != nil { - return 0, err - } - opsCh <- m - } - if height == newHeight || err != nil && !errors.Is(err, io.EOF) { - return newHeight, err - } - height = newHeight + req := &GetOpLogRequest{ + Body: &GetOpLogRequest_Body{ + ContainerId: rawCID, + TreeId: treeID, + Height: height, + }, } + if err := SignMessage(req, s.key); err != nil { + return err + } + + c, err := treeClient.GetOpLog(ctx, req) + if err != nil { + return fmt.Errorf("can't initialize client: %w", err) + } + res, err := c.Recv() + for ; err == nil; res, err = c.Recv() { + lm := res.GetBody().GetOperation() + m := &pilorama.Move{ + Parent: lm.ParentId, + Child: lm.ChildId, + } + if err := m.Meta.FromBytes(lm.Meta); err != nil { + return err + } + opsCh <- m + } + if err != nil && !errors.Is(err, io.EOF) { + return err + } + return nil } // synchronizeTree synchronizes operations getting them from different nodes. @@ -287,50 +287,44 @@ func (s *Service) synchronizeTree(ctx context.Context, cid cid.ID, from uint64, return nil }) + var allNodesSynced atomic.Bool + allNodesSynced.Store(true) + for i, n := range nodes { i := i n := n errGroup.Go(func() error { - height := from + var nodeSynced bool n.IterateNetworkEndpoints(func(addr string) bool { var a network.Address if err := a.FromString(addr); err != nil { + s.log.Warn(logs.TreeFailedToParseAddressForTreeSynchronization, zap.Error(err), zap.String("address", addr)) return false } - cc, err := grpc.DialContext(egCtx, a.URIAddr(), - grpc.WithChainUnaryInterceptor( - metrics.NewUnaryClientInterceptor(), - tracing_grpc.NewUnaryClientInteceptor(), - ), - grpc.WithChainStreamInterceptor( - metrics.NewStreamClientInterceptor(), - tracing_grpc.NewStreamClientInterceptor(), - ), - grpc.WithTransportCredentials(insecure.NewCredentials())) + cc, err := s.dialCtx(egCtx, a) if err != nil { - // Failed to connect, try the next address. + s.log.Warn(logs.TreeFailedToConnectForTreeSynchronization, zap.Error(err), zap.String("address", addr)) return false } defer cc.Close() - treeClient := NewTreeServiceClient(cc) - for { - h, err := s.startStream(egCtx, cid, treeID, from, treeClient, nodeOperationStreams[i]) - if height < h { - height = h - } - if err != nil || h <= height { - // Error with the response, try the next node. - return true - } + err = s.startStream(egCtx, cid, treeID, from, cc, nodeOperationStreams[i]) + if err != nil { + s.log.Warn(logs.TreeFailedToRunTreeSynchronizationForSpecificNode, zap.Error(err), zap.String("address", addr)) } + nodeSynced = err == nil + return true }) close(nodeOperationStreams[i]) + if !nodeSynced { + allNodesSynced.Store(false) + } return nil }) } if err := errGroup.Wait(); err != nil { + allNodesSynced.Store(false) s.log.Warn(logs.TreeFailedToRunTreeSynchronizationOverAllNodes, zap.Error(err)) } @@ -340,7 +334,23 @@ func (s *Service) synchronizeTree(ctx context.Context, cid cid.ID, from uint64, } else { newHeight++ } - return newHeight + if allNodesSynced.Load() { + return newHeight + } + return from +} + +func (*Service) dialCtx(egCtx context.Context, a network.Address) (*grpc.ClientConn, error) { + return grpc.DialContext(egCtx, a.URIAddr(), + grpc.WithChainUnaryInterceptor( + metrics.NewUnaryClientInterceptor(), + tracing_grpc.NewUnaryClientInteceptor(), + ), + grpc.WithChainStreamInterceptor( + metrics.NewStreamClientInterceptor(), + tracing_grpc.NewStreamClientInterceptor(), + ), + grpc.WithTransportCredentials(insecure.NewCredentials())) } // ErrAlreadySyncing is returned when a service synchronization has already