Fix tree service sync #829
2 changed files with 71 additions and 58 deletions
|
@ -64,6 +64,9 @@ const (
|
|||
TreeCouldNotUpdateLastSynchronizedHeightForATree = "could not update last synchronized height for a tree"
|
||||
TreeSynchronizeTree = "synchronize tree"
|
||||
TreeFailedToRunTreeSynchronizationOverAllNodes = "failed to run tree synchronization over all nodes"
|
||||
TreeFailedToRunTreeSynchronizationForSpecificNode = "failed to run tree synchronization for specific node"
|
||||
TreeFailedToParseAddressForTreeSynchronization = "failed to parse address for tree synchronization"
|
||||
TreeFailedToConnectForTreeSynchronization = "failed to connect for tree synchronization"
|
||||
TreeSyncingTrees = "syncing trees..."
|
||||
TreeCouldNotFetchContainers = "could not fetch containers"
|
||||
TreeTreesHaveBeenSynchronized = "trees have been synchronized"
|
||||
|
|
|
@ -9,6 +9,7 @@ import (
|
|||
"math"
|
||||
"math/rand"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"git.frostfs.info/TrueCloudLab/frostfs-node/internal/logs"
|
||||
|
@ -216,45 +217,44 @@ func (s *Service) applyOperationStream(ctx context.Context, cid cid.ID, treeID s
|
|||
}
|
||||
|
||||
func (s *Service) startStream(ctx context.Context, cid cid.ID, treeID string,
|
||||
height uint64, treeClient TreeServiceClient, opsCh chan<- *pilorama.Move,
|
||||
) (uint64, error) {
|
||||
height uint64, cc *grpc.ClientConn, opsCh chan<- *pilorama.Move,
|
||||
) error {
|
||||
treeClient := NewTreeServiceClient(cc)
|
||||
|
||||
rawCID := make([]byte, sha256.Size)
|
||||
cid.Encode(rawCID)
|
||||
|
||||
for {
|
||||
newHeight := height
|
||||
req := &GetOpLogRequest{
|
||||
Body: &GetOpLogRequest_Body{
|
||||
ContainerId: rawCID,
|
||||
TreeId: treeID,
|
||||
Height: newHeight,
|
||||
},
|
||||
}
|
||||
if err := SignMessage(req, s.key); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
c, err := treeClient.GetOpLog(ctx, req)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("can't initialize client: %w", err)
|
||||
}
|
||||
res, err := c.Recv()
|
||||
for ; err == nil; res, err = c.Recv() {
|
||||
lm := res.GetBody().GetOperation()
|
||||
m := &pilorama.Move{
|
||||
Parent: lm.ParentId,
|
||||
Child: lm.ChildId,
|
||||
}
|
||||
if err := m.Meta.FromBytes(lm.Meta); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
opsCh <- m
|
||||
}
|
||||
if height == newHeight || err != nil && !errors.Is(err, io.EOF) {
|
||||
return newHeight, err
|
||||
}
|
||||
height = newHeight
|
||||
req := &GetOpLogRequest{
|
||||
Body: &GetOpLogRequest_Body{
|
||||
ContainerId: rawCID,
|
||||
TreeId: treeID,
|
||||
Height: height,
|
||||
},
|
||||
}
|
||||
if err := SignMessage(req, s.key); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
c, err := treeClient.GetOpLog(ctx, req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("can't initialize client: %w", err)
|
||||
}
|
||||
res, err := c.Recv()
|
||||
for ; err == nil; res, err = c.Recv() {
|
||||
lm := res.GetBody().GetOperation()
|
||||
m := &pilorama.Move{
|
||||
Parent: lm.ParentId,
|
||||
Child: lm.ChildId,
|
||||
}
|
||||
if err := m.Meta.FromBytes(lm.Meta); err != nil {
|
||||
return err
|
||||
}
|
||||
opsCh <- m
|
||||
}
|
||||
if err != nil && !errors.Is(err, io.EOF) {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// synchronizeTree synchronizes operations getting them from different nodes.
|
||||
|
@ -287,50 +287,44 @@ func (s *Service) synchronizeTree(ctx context.Context, cid cid.ID, from uint64,
|
|||
return nil
|
||||
})
|
||||
|
||||
var allNodesSynced atomic.Bool
|
||||
allNodesSynced.Store(true)
|
||||
|
||||
for i, n := range nodes {
|
||||
i := i
|
||||
n := n
|
||||
errGroup.Go(func() error {
|
||||
height := from
|
||||
var nodeSynced bool
|
||||
n.IterateNetworkEndpoints(func(addr string) bool {
|
||||
var a network.Address
|
||||
if err := a.FromString(addr); err != nil {
|
||||
s.log.Warn(logs.TreeFailedToParseAddressForTreeSynchronization, zap.Error(err), zap.String("address", addr))
|
||||
return false
|
||||
}
|
||||
|
||||
cc, err := grpc.DialContext(egCtx, a.URIAddr(),
|
||||
grpc.WithChainUnaryInterceptor(
|
||||
metrics.NewUnaryClientInterceptor(),
|
||||
tracing_grpc.NewUnaryClientInteceptor(),
|
||||
),
|
||||
grpc.WithChainStreamInterceptor(
|
||||
metrics.NewStreamClientInterceptor(),
|
||||
tracing_grpc.NewStreamClientInterceptor(),
|
||||
),
|
||||
grpc.WithTransportCredentials(insecure.NewCredentials()))
|
||||
cc, err := s.dialCtx(egCtx, a)
|
||||
if err != nil {
|
||||
// Failed to connect, try the next address.
|
||||
s.log.Warn(logs.TreeFailedToConnectForTreeSynchronization, zap.Error(err), zap.String("address", addr))
|
||||
return false
|
||||
}
|
||||
defer cc.Close()
|
||||
|
||||
treeClient := NewTreeServiceClient(cc)
|
||||
for {
|
||||
h, err := s.startStream(egCtx, cid, treeID, from, treeClient, nodeOperationStreams[i])
|
||||
if height < h {
|
||||
height = h
|
||||
}
|
||||
if err != nil || h <= height {
|
||||
// Error with the response, try the next node.
|
||||
return true
|
||||
}
|
||||
err = s.startStream(egCtx, cid, treeID, from, cc, nodeOperationStreams[i])
|
||||
if err != nil {
|
||||
s.log.Warn(logs.TreeFailedToRunTreeSynchronizationForSpecificNode, zap.Error(err), zap.String("address", addr))
|
||||
}
|
||||
nodeSynced = err == nil
|
||||
return true
|
||||
})
|
||||
close(nodeOperationStreams[i])
|
||||
if !nodeSynced {
|
||||
allNodesSynced.Store(false)
|
||||
}
|
||||
return nil
|
||||
})
|
||||
}
|
||||
if err := errGroup.Wait(); err != nil {
|
||||
allNodesSynced.Store(false)
|
||||
s.log.Warn(logs.TreeFailedToRunTreeSynchronizationOverAllNodes, zap.Error(err))
|
||||
}
|
||||
|
||||
|
@ -340,7 +334,23 @@ func (s *Service) synchronizeTree(ctx context.Context, cid cid.ID, from uint64,
|
|||
} else {
|
||||
newHeight++
|
||||
}
|
||||
return newHeight
|
||||
if allNodesSynced.Load() {
|
||||
return newHeight
|
||||
}
|
||||
return from
|
||||
}
|
||||
|
||||
func (*Service) dialCtx(egCtx context.Context, a network.Address) (*grpc.ClientConn, error) {
|
||||
return grpc.DialContext(egCtx, a.URIAddr(),
|
||||
grpc.WithChainUnaryInterceptor(
|
||||
metrics.NewUnaryClientInterceptor(),
|
||||
tracing_grpc.NewUnaryClientInteceptor(),
|
||||
),
|
||||
grpc.WithChainStreamInterceptor(
|
||||
metrics.NewStreamClientInterceptor(),
|
||||
tracing_grpc.NewStreamClientInterceptor(),
|
||||
),
|
||||
grpc.WithTransportCredentials(insecure.NewCredentials()))
|
||||
}
|
||||
|
||||
// ErrAlreadySyncing is returned when a service synchronization has already
|
||||
|
|
Loading…
Reference in a new issue