forked from TrueCloudLab/frostfs-node
[#166] node: Parallelize background tree service sync by batching
* Merge operations Signed-off-by: Airat Arifullin a.arifullin@yadro.com
This commit is contained in:
parent
299b24b974
commit
9d01029733
2 changed files with 240 additions and 80 deletions
|
@ -125,6 +125,137 @@ func (s *Service) SynchronizeTree(ctx context.Context, cid cid.ID, treeID string
|
|||
return nil
|
||||
}
|
||||
|
||||
// mergeOperationStreams performs merge sort for node operation streams to one stream.
|
||||
func mergeOperationStreams(ctx context.Context, streams []chan *pilorama.Move, merged chan<- *pilorama.Move) uint64 {
|
||||
defer close(merged)
|
||||
|
||||
ms := make([]*pilorama.Move, len(streams))
|
||||
for i := range streams {
|
||||
ms[i] = <-streams[i]
|
||||
}
|
||||
|
||||
// Merging different node streams shuffles incoming operations like that:
|
||||
//
|
||||
// x - operation from the stream A
|
||||
// o - operation from the stream B
|
||||
//
|
||||
// --o---o--x--x--x--o---x--x------> t
|
||||
// ^
|
||||
// If all ops have been successfully applied, we must start from the last
|
||||
// operation height from the stream B. This height is stored in minStreamedLastHeight.
|
||||
var minStreamedLastHeight uint64 = math.MaxUint64
|
||||
|
||||
for {
|
||||
var minTimeMoveTime uint64 = math.MaxUint64
|
||||
minTimeMoveIndex := -1
|
||||
for i, m := range ms {
|
||||
if m != nil && minTimeMoveTime > m.Time {
|
||||
minTimeMoveTime = m.Time
|
||||
minTimeMoveIndex = i
|
||||
}
|
||||
}
|
||||
|
||||
if minTimeMoveIndex == -1 {
|
||||
break
|
||||
}
|
||||
|
||||
merged <- ms[minTimeMoveIndex]
|
||||
height := ms[minTimeMoveIndex].Time
|
||||
if ms[minTimeMoveIndex] = <-streams[minTimeMoveIndex]; ms[minTimeMoveIndex] == nil {
|
||||
if minStreamedLastHeight > height {
|
||||
minStreamedLastHeight = height
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return minStreamedLastHeight
|
||||
}
|
||||
|
||||
func (s *Service) applyOperationStream(ctx context.Context, cid cid.ID, treeID string,
|
||||
operationStream <-chan *pilorama.Move) uint64 {
|
||||
errGroup, _ := errgroup.WithContext(ctx)
|
||||
const workersCount = 1024
|
||||
errGroup.SetLimit(workersCount)
|
||||
|
||||
// We run TreeApply concurrently for the operation batch. Let's consider two operations
|
||||
// in the batch m1 and m2 such that m1.Time < m2.Time. The engine may apply m2 and fail
|
||||
// on m1. That means the service must start sync from m1.Time in the next iteration and
|
||||
// this height is stored in unappliedOperationHeight.
|
||||
var unappliedOperationHeight uint64 = math.MaxUint64
|
||||
var heightMtx sync.Mutex
|
||||
|
||||
var prev *pilorama.Move
|
||||
for m := range operationStream {
|
||||
m := m
|
||||
|
||||
// skip already applied op
|
||||
if prev != nil && prev.Time == m.Time {
|
||||
continue
|
||||
}
|
||||
prev = m
|
||||
|
||||
errGroup.Go(func() error {
|
||||
if err := s.forest.TreeApply(ctx, cid, treeID, m, true); err != nil {
|
||||
heightMtx.Lock()
|
||||
if m.Time < unappliedOperationHeight {
|
||||
unappliedOperationHeight = m.Time
|
||||
}
|
||||
heightMtx.Unlock()
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
})
|
||||
}
|
||||
_ = errGroup.Wait()
|
||||
return unappliedOperationHeight
|
||||
}
|
||||
|
||||
func (s *Service) startStream(ctx context.Context, cid cid.ID, treeID string,
|
||||
height uint64, treeClient TreeServiceClient, opsCh chan<- *pilorama.Move) (uint64, error) {
|
||||
rawCID := make([]byte, sha256.Size)
|
||||
cid.Encode(rawCID)
|
||||
|
||||
for {
|
||||
newHeight := height
|
||||
req := &GetOpLogRequest{
|
||||
Body: &GetOpLogRequest_Body{
|
||||
ContainerId: rawCID,
|
||||
TreeId: treeID,
|
||||
Height: newHeight,
|
||||
},
|
||||
}
|
||||
if err := SignMessage(req, s.key); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
c, err := treeClient.GetOpLog(ctx, req)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("can't initialize client: %w", err)
|
||||
}
|
||||
res, err := c.Recv()
|
||||
for ; err == nil; res, err = c.Recv() {
|
||||
lm := res.GetBody().GetOperation()
|
||||
m := &pilorama.Move{
|
||||
Parent: lm.ParentId,
|
||||
Child: lm.ChildId,
|
||||
}
|
||||
if err := m.Meta.FromBytes(lm.Meta); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
opsCh <- m
|
||||
}
|
||||
if height == newHeight || err != nil && !errors.Is(err, io.EOF) {
|
||||
return newHeight, err
|
||||
}
|
||||
height = newHeight
|
||||
}
|
||||
}
|
||||
|
||||
// synchronizeTree synchronizes operations getting them from different nodes.
|
||||
// Each available node does stream operations to a separate stream. These streams
|
||||
// are merged into one big stream ordered by operation time. This way allows to skip
|
||||
// already applied operation and keep good batching.
|
||||
// The method returns the height that the service should start the next sync from.
|
||||
func (s *Service) synchronizeTree(ctx context.Context, cid cid.ID, from uint64,
|
||||
treeID string, nodes []netmapSDK.NodeInfo) uint64 {
|
||||
s.log.Debug(logs.TreeSynchronizeTree,
|
||||
|
@ -133,10 +264,25 @@ func (s *Service) synchronizeTree(ctx context.Context, cid cid.ID, from uint64,
|
|||
zap.Uint64("from", from))
|
||||
|
||||
errGroup, egCtx := errgroup.WithContext(ctx)
|
||||
const workersCount = 4
|
||||
const workersCount = 1024
|
||||
errGroup.SetLimit(workersCount)
|
||||
|
||||
heights := make([]uint64, len(nodes))
|
||||
nodeOperationStreams := make([]chan *pilorama.Move, len(nodes))
|
||||
for i := range nodeOperationStreams {
|
||||
nodeOperationStreams[i] = make(chan *pilorama.Move)
|
||||
}
|
||||
merged := make(chan *pilorama.Move)
|
||||
var minStreamedLastHeight uint64
|
||||
errGroup.Go(func() error {
|
||||
minStreamedLastHeight = mergeOperationStreams(ctx, nodeOperationStreams, merged)
|
||||
return nil
|
||||
})
|
||||
var minUnappliedHeight uint64
|
||||
errGroup.Go(func() error {
|
||||
minUnappliedHeight = s.applyOperationStream(ctx, cid, treeID, merged)
|
||||
return nil
|
||||
})
|
||||
|
||||
for i, n := range nodes {
|
||||
i := i
|
||||
n := n
|
||||
|
@ -164,7 +310,7 @@ func (s *Service) synchronizeTree(ctx context.Context, cid cid.ID, from uint64,
|
|||
|
||||
treeClient := NewTreeServiceClient(cc)
|
||||
for {
|
||||
h, err := s.synchronizeSingle(egCtx, cid, treeID, height, treeClient)
|
||||
h, err := s.startStream(egCtx, cid, treeID, from, treeClient, nodeOperationStreams[i])
|
||||
if height < h {
|
||||
height = h
|
||||
}
|
||||
|
@ -174,94 +320,23 @@ func (s *Service) synchronizeTree(ctx context.Context, cid cid.ID, from uint64,
|
|||
}
|
||||
}
|
||||
})
|
||||
|
||||
if height <= from { // do not increase starting height on fail
|
||||
heights[i] = from
|
||||
return nil
|
||||
}
|
||||
heights[i] = height
|
||||
close(nodeOperationStreams[i])
|
||||
return nil
|
||||
})
|
||||
}
|
||||
|
||||
if err := errGroup.Wait(); err != nil {
|
||||
s.log.Warn(logs.TreeFailedToRunTreeSynchronizationOverAllNodes, zap.Error(err))
|
||||
}
|
||||
|
||||
newHeight := uint64(math.MaxUint64)
|
||||
for _, height := range heights { // take minimum across all clients
|
||||
if height < newHeight {
|
||||
newHeight = height
|
||||
}
|
||||
}
|
||||
if newHeight == math.MaxUint64 {
|
||||
newHeight = from
|
||||
newHeight := minStreamedLastHeight
|
||||
if newHeight > minUnappliedHeight {
|
||||
newHeight = minUnappliedHeight
|
||||
} else {
|
||||
newHeight++
|
||||
}
|
||||
return newHeight
|
||||
}
|
||||
|
||||
func (s *Service) synchronizeSingle(ctx context.Context, cid cid.ID, treeID string, height uint64, treeClient TreeServiceClient) (uint64, error) {
|
||||
rawCID := make([]byte, sha256.Size)
|
||||
cid.Encode(rawCID)
|
||||
|
||||
const treeApplyWorkersCount = 1024
|
||||
errGroup, egCtx := errgroup.WithContext(ctx)
|
||||
errGroup.SetLimit(treeApplyWorkersCount)
|
||||
var heightMtx sync.Mutex
|
||||
|
||||
for {
|
||||
newHeight := height
|
||||
req := &GetOpLogRequest{
|
||||
Body: &GetOpLogRequest_Body{
|
||||
ContainerId: rawCID,
|
||||
TreeId: treeID,
|
||||
Height: newHeight,
|
||||
},
|
||||
}
|
||||
if err := SignMessage(req, s.key); err != nil {
|
||||
return newHeight, err
|
||||
}
|
||||
|
||||
c, err := treeClient.GetOpLog(egCtx, req)
|
||||
if err != nil {
|
||||
return newHeight, fmt.Errorf("can't initialize client: %w", err)
|
||||
}
|
||||
|
||||
res, err := c.Recv()
|
||||
for ; err == nil; res, err = c.Recv() {
|
||||
lm := res.GetBody().GetOperation()
|
||||
m := &pilorama.Move{
|
||||
Parent: lm.ParentId,
|
||||
Child: lm.ChildId,
|
||||
}
|
||||
if err := m.Meta.FromBytes(lm.Meta); err != nil {
|
||||
_ = errGroup.Wait()
|
||||
return newHeight, err
|
||||
}
|
||||
errGroup.Go(func() error {
|
||||
if err := s.forest.TreeApply(egCtx, cid, treeID, m, true); err != nil {
|
||||
return err
|
||||
}
|
||||
heightMtx.Lock()
|
||||
if m.Time > newHeight {
|
||||
newHeight = m.Time + 1
|
||||
} else {
|
||||
newHeight++
|
||||
}
|
||||
heightMtx.Unlock()
|
||||
return nil
|
||||
})
|
||||
}
|
||||
if errGroupErr := errGroup.Wait(); errGroupErr != nil {
|
||||
return newHeight, err
|
||||
}
|
||||
if height == newHeight || err != nil && !errors.Is(err, io.EOF) {
|
||||
return newHeight, err
|
||||
}
|
||||
height = newHeight
|
||||
}
|
||||
}
|
||||
|
||||
// ErrAlreadySyncing is returned when a service synchronization has already
|
||||
// been started.
|
||||
var ErrAlreadySyncing = errors.New("service is being synchronized")
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue