[#329] node: Make evacuate async

Now it is possible to run shard evacuation asynchronously.
Also, only one evacuation process can be in progress at a time.

Signed-off-by: Dmitrii Stepanov <d.stepanov@yadro.com>
Dmitrii Stepanov 2023-05-04 13:58:26 +03:00 committed by Evgenii Stratonikov
parent 100b1b5128
commit e4889e06ba
11 changed files with 667 additions and 37 deletions
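A rough usage sketch of the new engine-level flow, assuming an initialized StorageEngine and read-only shards prepared elsewhere (the identifiers come from the diff below; the helper name and the polling loop are illustrative, not part of the commit):

package example

import (
    "context"
    "time"

    "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/local_object_storage/engine"
    "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/local_object_storage/shard"
)

// evacuateAndWait is a hypothetical helper: it starts an asynchronous
// evacuation of the given shards and polls the engine until the job reports
// completion. e and ids are assumed to be prepared by the caller (the shards
// must already be in read-only mode).
func evacuateAndWait(ctx context.Context, e *engine.StorageEngine, ids []*shard.ID) error {
    var prm engine.EvacuateShardPrm
    prm.WithShardIDList(ids)
    prm.WithIgnoreErrors(true)
    prm.WithAsync(true) // return immediately; evacuation continues in background

    // With async set, Evacuate only registers the job and returns a nil result.
    // Starting a second job while one is running fails with
    // "evacuate is already running for shard ids ...".
    if _, err := e.Evacuate(ctx, prm); err != nil {
        return err
    }

    for {
        st, err := e.GetEvacuationState(ctx)
        if err != nil {
            return err
        }
        if st.ProcessingStatus() == engine.EvacuateProcessStateCompleted {
            return nil
        }
        time.Sleep(time.Second)
    }
}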

View file

@ -484,5 +484,9 @@ const (
    ShardGCCollectingExpiredLocksCompleted    = "collecting expired locks completed"
    ShardGCRemoveGarbageStarted               = "garbage remove started"
    ShardGCRemoveGarbageCompleted             = "garbage remove completed"
    EngineShardsEvacuationFailedToCount       = "failed to get total objects count to evacuate"
    EngineShardsEvacuationFailedToListObjects = "failed to list objects to evacuate"
    EngineShardsEvacuationFailedToReadObject  = "failed to read object to evacuate"
    EngineShardsEvacuationFailedToMoveObject  = "failed to evacuate object to other node"
    ShardGCFailedToGetExpiredWithLinked       = "failed to get expired objects with linked"
)

View file

@ -35,6 +35,7 @@ type StorageEngine struct {
        err error
    }

    evacuateLimiter *evacuationLimiter
}

type shardWrapper struct {
@ -230,6 +231,9 @@ func New(opts ...Option) *StorageEngine {
        shardPools: make(map[string]util.WorkerPool),
        closeCh:    make(chan struct{}),
        setModeCh:  make(chan setModeRequest),
        evacuateLimiter: &evacuationLimiter{
            guard: &sync.RWMutex{},
        },
    }
}

View file

@ -5,6 +5,7 @@ import (
"errors" "errors"
"fmt" "fmt"
"git.frostfs.info/TrueCloudLab/frostfs-api-go/v2/pkg/tracing"
"git.frostfs.info/TrueCloudLab/frostfs-node/internal/logs" "git.frostfs.info/TrueCloudLab/frostfs-node/internal/logs"
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/core/object" "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/core/object"
meta "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/local_object_storage/metabase" meta "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/local_object_storage/metabase"
@ -14,6 +15,9 @@ import (
    objectSDK "git.frostfs.info/TrueCloudLab/frostfs-sdk-go/object"
    oid "git.frostfs.info/TrueCloudLab/frostfs-sdk-go/object/id"
    "git.frostfs.info/TrueCloudLab/hrw"
    "go.opentelemetry.io/otel/attribute"
    "go.opentelemetry.io/otel/trace"
    "go.uber.org/atomic"
    "go.uber.org/zap"
)
@ -24,11 +28,23 @@ type EvacuateShardPrm struct {
    shardID      []*shard.ID
    handler      func(context.Context, oid.Address, *objectSDK.Object) error
    ignoreErrors bool
    async        bool
}

// EvacuateShardRes represents result of the EvacuateShard operation.
type EvacuateShardRes struct {
    evacuated *atomic.Uint64
    total     *atomic.Uint64
    failed    *atomic.Uint64
}

// NewEvacuateShardRes creates new EvacuateShardRes instance.
func NewEvacuateShardRes() *EvacuateShardRes {
    return &EvacuateShardRes{
        evacuated: atomic.NewUint64(0),
        total:     atomic.NewUint64(0),
        failed:    atomic.NewUint64(0),
    }
}

// WithShardIDList sets shard ID.
@ -46,10 +62,46 @@ func (p *EvacuateShardPrm) WithFaultHandler(f func(context.Context, oid.Address,
    p.handler = f
}

// WithAsync sets flag to run evacuate async.
func (p *EvacuateShardPrm) WithAsync(async bool) {
    p.async = async
}

// Evacuated returns amount of evacuated objects.
// Objects for which handler returned no error are also assumed evacuated.
func (p *EvacuateShardRes) Evacuated() uint64 {
    if p == nil {
        return 0
    }
    return p.evacuated.Load()
}

// Total returns total count objects to evacuate.
func (p *EvacuateShardRes) Total() uint64 {
    if p == nil {
        return 0
    }
    return p.total.Load()
}

// Failed returns count of failed objects to evacuate.
func (p *EvacuateShardRes) Failed() uint64 {
    if p == nil {
        return 0
    }
    return p.failed.Load()
}

// DeepCopy returns deep copy of result instance.
func (p *EvacuateShardRes) DeepCopy() *EvacuateShardRes {
    if p == nil {
        return nil
    }

    return &EvacuateShardRes{
        evacuated: atomic.NewUint64(p.evacuated.Load()),
        total:     atomic.NewUint64(p.total.Load()),
        failed:    atomic.NewUint64(p.failed.Load()),
    }
}

const defaultEvacuateBatchSize = 100
@ -63,15 +115,29 @@ var errMustHaveTwoShards = errors.New("must have at least 1 spare shard")
// Evacuate moves data from one shard to the others.
// The shard being moved must be in read-only mode.
func (e *StorageEngine) Evacuate(ctx context.Context, prm EvacuateShardPrm) (*EvacuateShardRes, error) {
    select {
    case <-ctx.Done():
        return nil, ctx.Err()
    default:
    }

    shardIDs := make([]string, len(prm.shardID))
    for i := range prm.shardID {
        shardIDs[i] = prm.shardID[i].String()
    }

    ctx, span := tracing.StartSpanFromContext(ctx, "StorageEngine.Evacuate",
        trace.WithAttributes(
            attribute.StringSlice("shardIDs", shardIDs),
            attribute.Bool("async", prm.async),
            attribute.Bool("ignoreErrors", prm.ignoreErrors),
        ))
    defer span.End()

    shards, weights, err := e.getActualShards(shardIDs, prm.handler != nil)
    if err != nil {
        return nil, err
    }

    shardsToEvacuate := make(map[string]*shard.Shard)
@ -83,23 +149,91 @@ func (e *StorageEngine) Evacuate(ctx context.Context, prm EvacuateShardPrm) (Eva
        }
    }

    res := NewEvacuateShardRes()
    ctx = ctxOrBackground(ctx, prm.async)
    eg, egCtx, err := e.evacuateLimiter.TryStart(ctx, shardIDs, res)
    if err != nil {
        return nil, err
    }

    eg.Go(func() error {
        return e.evacuateShards(egCtx, shardIDs, prm, res, shards, weights, shardsToEvacuate)
    })

    if prm.async {
        return nil, nil
    }

    return res, eg.Wait()
}

func ctxOrBackground(ctx context.Context, background bool) context.Context {
    if background {
        return context.Background()
    }
    return ctx
}

func (e *StorageEngine) evacuateShards(ctx context.Context, shardIDs []string, prm EvacuateShardPrm, res *EvacuateShardRes,
    shards []pooledShard, weights []float64, shardsToEvacuate map[string]*shard.Shard) error {
    var err error
    ctx, span := tracing.StartSpanFromContext(ctx, "StorageEngine.evacuateShards",
        trace.WithAttributes(
            attribute.StringSlice("shardIDs", shardIDs),
            attribute.Bool("async", prm.async),
            attribute.Bool("ignoreErrors", prm.ignoreErrors),
        ))

    defer func() {
        span.End()
        e.evacuateLimiter.Complete(err)
    }()

    e.log.Info(logs.EngineStartedShardsEvacuation, zap.Strings("shard_ids", shardIDs))

    err = e.getTotalObjectsCount(ctx, shardsToEvacuate, res)
    if err != nil {
        e.log.Error(logs.EngineShardsEvacuationFailedToCount, zap.Strings("shard_ids", shardIDs), zap.Error(err))
        return err
    }

    for _, shardID := range shardIDs {
        if err = e.evacuateShard(ctx, shardID, prm, res, shards, weights, shardsToEvacuate); err != nil {
            e.log.Error(logs.EngineFinishedWithErrorShardsEvacuation, zap.Error(err), zap.Strings("shard_ids", shardIDs))
            return err
        }
    }

    e.log.Info(logs.EngineFinishedSuccessfullyShardsEvacuation, zap.Strings("shard_ids", shardIDs))
    return nil
}

func (e *StorageEngine) getTotalObjectsCount(ctx context.Context, shardsToEvacuate map[string]*shard.Shard, res *EvacuateShardRes) error {
    ctx, span := tracing.StartSpanFromContext(ctx, "StorageEngine.getTotalObjectsCount")
    defer span.End()

    for _, sh := range shardsToEvacuate {
        cnt, err := sh.LogicalObjectsCount(ctx)
        if err != nil {
            if errors.Is(err, shard.ErrDegradedMode) {
                continue
            }
            return err
        }
        res.total.Add(cnt)
    }

    return nil
}

func (e *StorageEngine) evacuateShard(ctx context.Context, shardID string, prm EvacuateShardPrm, res *EvacuateShardRes,
    shards []pooledShard, weights []float64, shardsToEvacuate map[string]*shard.Shard) error {
    ctx, span := tracing.StartSpanFromContext(ctx, "StorageEngine.evacuateShard",
        trace.WithAttributes(
            attribute.String("shardID", shardID),
        ))
    defer span.End()

    var listPrm shard.ListWithCursorPrm
    listPrm.WithCount(defaultEvacuateBatchSize)
@ -116,6 +250,7 @@ func (e *StorageEngine) evacuateShard(ctx context.Context, shardID string, prm E
        if errors.Is(err, meta.ErrEndOfListing) || errors.Is(err, shard.ErrDegradedMode) {
            break
        }
        e.log.Error(logs.EngineShardsEvacuationFailedToListObjects, zap.String("shard_id", shardID), zap.Error(err))
        return err
    }
@ -168,6 +303,12 @@ func (e *StorageEngine) getActualShards(shardIDs []string, handlerDefined bool)
func (e *StorageEngine) evacuateObjects(ctx context.Context, sh *shard.Shard, toEvacuate []object.AddressWithType, prm EvacuateShardPrm, res *EvacuateShardRes,
    shards []pooledShard, weights []float64, shardsToEvacuate map[string]*shard.Shard) error {
    ctx, span := tracing.StartSpanFromContext(ctx, "StorageEngine.evacuateObjects",
        trace.WithAttributes(
            attribute.Int("objects_count", len(toEvacuate)),
        ))
    defer span.End()

    for i := range toEvacuate {
        select {
        case <-ctx.Done():
@ -182,12 +323,14 @@ func (e *StorageEngine) evacuateObjects(ctx context.Context, sh *shard.Shard, to
        getRes, err := sh.Get(ctx, getPrm)
        if err != nil {
            if prm.ignoreErrors {
                res.failed.Inc()
                continue
            }
            e.log.Error(logs.EngineShardsEvacuationFailedToReadObject, zap.String("address", addr.EncodeToString()), zap.Error(err))
            return err
        }

        evacuatedLocal, err := e.tryEvacuateObjectLocal(ctx, addr, getRes.Object(), sh, shards, weights, shardsToEvacuate, res)
        if err != nil {
            return err
        }
@ -204,15 +347,16 @@ func (e *StorageEngine) evacuateObjects(ctx context.Context, sh *shard.Shard, to
        err = prm.handler(ctx, addr, getRes.Object())
        if err != nil {
            e.log.Error(logs.EngineShardsEvacuationFailedToMoveObject, zap.String("address", addr.EncodeToString()), zap.Error(err))
            return err
        }
        res.evacuated.Inc()
    }

    return nil
}

func (e *StorageEngine) tryEvacuateObjectLocal(ctx context.Context, addr oid.Address, object *objectSDK.Object, sh *shard.Shard,
    shards []pooledShard, weights []float64, shardsToEvacuate map[string]*shard.Shard, res *EvacuateShardRes) (bool, error) {
    hrw.SortHasherSliceByWeightValue(shards, weights, hrw.Hash([]byte(addr.EncodeToString())))
    for j := range shards {
        select {
@ -227,11 +371,11 @@ func (e *StorageEngine) tryEvacuateObjectLocal(ctx context.Context, addr oid.Add
        putDone, exists := e.putToShard(ctx, shards[j].hashedShard, j, shards[j].pool, addr, object)
        if putDone || exists {
            if putDone {
                res.evacuated.Inc()
                e.log.Debug(logs.EngineObjectIsMovedToAnotherShard,
                    zap.Stringer("from", sh.ID()),
                    zap.Stringer("to", shards[j].ID()),
                    zap.Stringer("addr", addr))
            }

            return true, nil
        }
@ -239,3 +383,23 @@ func (e *StorageEngine) tryEvacuateObjectLocal(ctx context.Context, addr oid.Add
    return false, nil
}

func (e *StorageEngine) GetEvacuationState(ctx context.Context) (*EvacuationState, error) {
    select {
    case <-ctx.Done():
        return nil, ctx.Err()
    default:
    }

    return e.evacuateLimiter.GetState(), nil
}

func (e *StorageEngine) EnqueRunningEvacuationStop(ctx context.Context) error {
    select {
    case <-ctx.Done():
        return ctx.Err()
    default:
    }

    return e.evacuateLimiter.CancelIfRunning()
}

View file

@ -0,0 +1,178 @@
package engine

import (
    "context"
    "fmt"
    "sync"
    "time"

    "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/local_object_storage/util/logicerr"
    "golang.org/x/sync/errgroup"
)

type EvacuateProcessState int

const (
    EvacuateProcessStateUndefined EvacuateProcessState = iota
    EvacuateProcessStateRunning
    EvacuateProcessStateCompleted
)

type EvacuationState struct {
    shardIDs     []string
    processState EvacuateProcessState
    startedAt    time.Time
    finishedAt   time.Time
    result       *EvacuateShardRes
    errMessage   string
}

func (s *EvacuationState) ShardIDs() []string {
    if s == nil {
        return nil
    }
    return s.shardIDs
}

func (s *EvacuationState) Evacuated() uint64 {
    if s == nil {
        return 0
    }
    return s.result.Evacuated()
}

func (s *EvacuationState) Total() uint64 {
    if s == nil {
        return 0
    }
    return s.result.Total()
}

func (s *EvacuationState) Failed() uint64 {
    if s == nil {
        return 0
    }
    return s.result.Failed()
}

func (s *EvacuationState) ProcessingStatus() EvacuateProcessState {
    if s == nil {
        return EvacuateProcessStateUndefined
    }
    return s.processState
}

func (s *EvacuationState) StartedAt() *time.Time {
    if s == nil {
        return nil
    }
    defaultTime := time.Time{}
    if s.startedAt == defaultTime {
        return nil
    }
    return &s.startedAt
}

func (s *EvacuationState) FinishedAt() *time.Time {
    if s == nil {
        return nil
    }
    defaultTime := time.Time{}
    if s.finishedAt == defaultTime {
        return nil
    }
    return &s.finishedAt
}

func (s *EvacuationState) ErrorMessage() string {
    if s == nil {
        return ""
    }
    return s.errMessage
}

func (s *EvacuationState) DeepCopy() *EvacuationState {
    if s == nil {
        return nil
    }
    shardIDs := make([]string, len(s.shardIDs))
    copy(shardIDs, s.shardIDs)

    return &EvacuationState{
        shardIDs:     shardIDs,
        processState: s.processState,
        startedAt:    s.startedAt,
        finishedAt:   s.finishedAt,
        errMessage:   s.errMessage,
        result:       s.result.DeepCopy(),
    }
}

type evacuationLimiter struct {
    state  EvacuationState
    eg     *errgroup.Group
    cancel context.CancelFunc

    guard *sync.RWMutex
}

func (l *evacuationLimiter) TryStart(ctx context.Context, shardIDs []string, result *EvacuateShardRes) (*errgroup.Group, context.Context, error) {
    l.guard.Lock()
    defer l.guard.Unlock()

    select {
    case <-ctx.Done():
        return nil, nil, ctx.Err()
    default:
    }

    if l.state.processState == EvacuateProcessStateRunning {
        return nil, nil, logicerr.New(fmt.Sprintf("evacuate is already running for shard ids %v", l.state.shardIDs))
    }

    var egCtx context.Context
    egCtx, l.cancel = context.WithCancel(ctx)
    l.eg, egCtx = errgroup.WithContext(egCtx)
    l.state = EvacuationState{
        shardIDs:     shardIDs,
        processState: EvacuateProcessStateRunning,
        startedAt:    time.Now().UTC(),
        result:       result,
    }

    return l.eg, egCtx, nil
}

func (l *evacuationLimiter) Complete(err error) {
    l.guard.Lock()
    defer l.guard.Unlock()

    errMsq := ""
    if err != nil {
        errMsq = err.Error()
    }
    l.state.processState = EvacuateProcessStateCompleted
    l.state.errMessage = errMsq
    l.state.finishedAt = time.Now().UTC()

    l.eg = nil
}

func (l *evacuationLimiter) GetState() *EvacuationState {
    l.guard.RLock()
    defer l.guard.RUnlock()

    return l.state.DeepCopy()
}

func (l *evacuationLimiter) CancelIfRunning() error {
    l.guard.Lock()
    defer l.guard.Unlock()

    if l.state.processState != EvacuateProcessStateRunning {
        return logicerr.New("there is no running evacuation task")
    }

    l.cancel()
    return nil
}
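The limiter above is what enforces the single-process guarantee. A condensed sketch of the intended call pattern follows; in the actual change this is spread across Evacuate (TryStart) and evacuateShards (Complete in a defer), and runLimited/runEvacuation are illustrative names, not part of the commit:

// Sketch only (conceptually inside package engine, since evacuationLimiter is unexported).
func runLimited(ctx context.Context, limiter *evacuationLimiter, shardIDs []string,
    runEvacuation func(context.Context, *EvacuateShardRes) error) (*EvacuateShardRes, error) {
    res := NewEvacuateShardRes()

    // TryStart refuses with a logical error while another evacuation is running.
    eg, egCtx, err := limiter.TryStart(ctx, shardIDs, res)
    if err != nil {
        return nil, err
    }

    eg.Go(func() error {
        workErr := runEvacuation(egCtx, res)
        // Complete must be called exactly once per started job: it records the
        // final state, error message and finish time for later GetState queries.
        limiter.Complete(workErr)
        return workErr
    })

    return res, eg.Wait()
}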

View file

@ -7,6 +7,7 @@ import (
"path/filepath" "path/filepath"
"strconv" "strconv"
"testing" "testing"
"time"
objectCore "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/core/object" objectCore "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/core/object"
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/local_object_storage/blobstor" "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/local_object_storage/blobstor"
@ -21,6 +22,7 @@ import (
    oid "git.frostfs.info/TrueCloudLab/frostfs-sdk-go/object/id"
    "github.com/stretchr/testify/require"
    "go.uber.org/zap/zaptest"
    "golang.org/x/sync/errgroup"
)

func newEngineEvacuate(t *testing.T, shardNum int, objPerShard int) (*StorageEngine, []*shard.ID, []*objectSDK.Object) {
@ -103,14 +105,14 @@ func TestEvacuateShard(t *testing.T) {
t.Run("must be read-only", func(t *testing.T) { t.Run("must be read-only", func(t *testing.T) {
res, err := e.Evacuate(context.Background(), prm) res, err := e.Evacuate(context.Background(), prm)
require.ErrorIs(t, err, ErrMustBeReadOnly) require.ErrorIs(t, err, ErrMustBeReadOnly)
require.Equal(t, 0, res.Count()) require.Equal(t, uint64(0), res.Evacuated())
}) })
require.NoError(t, e.shards[evacuateShardID].SetMode(mode.ReadOnly)) require.NoError(t, e.shards[evacuateShardID].SetMode(mode.ReadOnly))
res, err := e.Evacuate(context.Background(), prm) res, err := e.Evacuate(context.Background(), prm)
require.NoError(t, err) require.NoError(t, err)
require.Equal(t, objPerShard, res.count) require.Equal(t, uint64(objPerShard), res.Evacuated())
// We check that all objects are available both before and after shard removal. // We check that all objects are available both before and after shard removal.
// First case is a real-world use-case. It ensures that an object can be put in presense // First case is a real-world use-case. It ensures that an object can be put in presense
@ -121,7 +123,7 @@ func TestEvacuateShard(t *testing.T) {
    // Calling it again is OK, but all objects are already moved, so no new PUTs should be done.
    res, err = e.Evacuate(context.Background(), prm)
    require.NoError(t, err)
    require.Equal(t, uint64(0), res.Evacuated())

    checkHasObjects(t)
@ -138,8 +140,8 @@ func TestEvacuateNetwork(t *testing.T) {
    var errReplication = errors.New("handler error")

    acceptOneOf := func(objects []*objectSDK.Object, max uint64) func(context.Context, oid.Address, *objectSDK.Object) error {
        var n uint64
        return func(_ context.Context, addr oid.Address, obj *objectSDK.Object) error {
            if n == max {
                return errReplication
@ -169,13 +171,13 @@ func TestEvacuateNetwork(t *testing.T) {
        res, err := e.Evacuate(context.Background(), prm)
        require.ErrorIs(t, err, errMustHaveTwoShards)
        require.Equal(t, uint64(0), res.Evacuated())

        prm.handler = acceptOneOf(objects, 2)

        res, err = e.Evacuate(context.Background(), prm)
        require.ErrorIs(t, err, errReplication)
        require.Equal(t, uint64(2), res.Evacuated())
    })

    t.Run("multiple shards, evacuate one", func(t *testing.T) {
        t.Parallel()
@ -190,14 +192,14 @@ func TestEvacuateNetwork(t *testing.T) {
        res, err := e.Evacuate(context.Background(), prm)
        require.ErrorIs(t, err, errReplication)
        require.Equal(t, uint64(2), res.Evacuated())

        t.Run("no errors", func(t *testing.T) {
            prm.handler = acceptOneOf(objects, 3)

            res, err := e.Evacuate(context.Background(), prm)
            require.NoError(t, err)
            require.Equal(t, uint64(3), res.Evacuated())
        })
    })

    t.Run("multiple shards, evacuate many", func(t *testing.T) {
@ -205,12 +207,12 @@ func TestEvacuateNetwork(t *testing.T) {
        e, ids, objects := newEngineEvacuate(t, 4, 5)
        evacuateIDs := ids[0:3]

        var totalCount uint64
        for i := range evacuateIDs {
            res, err := e.shards[ids[i].String()].List()
            require.NoError(t, err)

            totalCount += uint64(len(res.AddressList()))
        }

        for i := range ids {
@ -223,14 +225,14 @@ func TestEvacuateNetwork(t *testing.T) {
        res, err := e.Evacuate(context.Background(), prm)
        require.ErrorIs(t, err, errReplication)
        require.Equal(t, totalCount-1, res.Evacuated())

        t.Run("no errors", func(t *testing.T) {
            prm.handler = acceptOneOf(objects, totalCount)

            res, err := e.Evacuate(context.Background(), prm)
            require.NoError(t, err)
            require.Equal(t, totalCount, res.Evacuated())
        })
    })
}
@ -258,5 +260,114 @@ func TestEvacuateCancellation(t *testing.T) {
    res, err := e.Evacuate(ctx, prm)
    require.ErrorContains(t, err, "context canceled")
    require.Equal(t, uint64(0), res.Evacuated())
}

func TestEvacuateSingleProcess(t *testing.T) {
    e, ids, _ := newEngineEvacuate(t, 2, 3)

    require.NoError(t, e.shards[ids[0].String()].SetMode(mode.ReadOnly))
    require.NoError(t, e.shards[ids[1].String()].SetMode(mode.ReadOnly))

    blocker := make(chan interface{})
    running := make(chan interface{})

    var prm EvacuateShardPrm
    prm.shardID = ids[1:2]
    prm.handler = func(ctx context.Context, a oid.Address, o *objectSDK.Object) error {
        select {
        case <-running:
        default:
            close(running)
        }
        <-blocker
        return nil
    }

    eg, egCtx := errgroup.WithContext(context.Background())
    eg.Go(func() error {
        res, err := e.Evacuate(egCtx, prm)
        require.NoError(t, err, "first evacuation failed")
        require.Equal(t, uint64(3), res.Evacuated())
        return nil
    })
    eg.Go(func() error {
        <-running
        res, err := e.Evacuate(egCtx, prm)
        require.ErrorContains(t, err, "evacuate is already running for shard ids", "second evacuation not failed")
        require.Equal(t, uint64(0), res.Evacuated())
        close(blocker)
        return nil
    })
    require.NoError(t, eg.Wait())
}

func TestEvacuateAsync(t *testing.T) {
    e, ids, _ := newEngineEvacuate(t, 2, 3)

    require.NoError(t, e.shards[ids[0].String()].SetMode(mode.ReadOnly))
    require.NoError(t, e.shards[ids[1].String()].SetMode(mode.ReadOnly))

    blocker := make(chan interface{})
    running := make(chan interface{})

    var prm EvacuateShardPrm
    prm.shardID = ids[1:2]
    prm.handler = func(ctx context.Context, a oid.Address, o *objectSDK.Object) error {
        select {
        case <-running:
        default:
            close(running)
        }
        <-blocker
        return nil
    }

    st, err := e.GetEvacuationState(context.Background())
    require.NoError(t, err, "get init state failed")
    require.Equal(t, EvacuateProcessStateUndefined, st.ProcessingStatus(), "invalid init state")
    require.Equal(t, uint64(0), st.Evacuated(), "invalid init count")
    require.Nil(t, st.StartedAt(), "invalid init started at")
    require.Nil(t, st.FinishedAt(), "invalid init finished at")
    require.ElementsMatch(t, []string{}, st.ShardIDs(), "invalid init shard ids")
    require.Equal(t, "", st.ErrorMessage(), "invalid init error message")

    eg, egCtx := errgroup.WithContext(context.Background())
    eg.Go(func() error {
        res, err := e.Evacuate(egCtx, prm)
        require.NoError(t, err, "first evacuation failed")
        require.Equal(t, uint64(3), res.Evacuated())
        return nil
    })

    <-running

    st, err = e.GetEvacuationState(context.Background())
    require.NoError(t, err, "get running state failed")
    require.Equal(t, EvacuateProcessStateRunning, st.ProcessingStatus(), "invalid running state")
    require.Equal(t, uint64(0), st.Evacuated(), "invalid running count")
    require.NotNil(t, st.StartedAt(), "invalid running started at")
    require.Nil(t, st.FinishedAt(), "invalid init finished at")
    expectedShardIDs := make([]string, 0, 2)
    for _, id := range ids[1:2] {
        expectedShardIDs = append(expectedShardIDs, id.String())
    }
    require.ElementsMatch(t, expectedShardIDs, st.ShardIDs(), "invalid running shard ids")
    require.Equal(t, "", st.ErrorMessage(), "invalid init error message")

    close(blocker)

    require.Eventually(t, func() bool {
        st, err = e.GetEvacuationState(context.Background())
        return st.ProcessingStatus() == EvacuateProcessStateCompleted
    }, 3*time.Second, 10*time.Millisecond, "invalid final state")

    require.NoError(t, err, "get final state failed")
    require.Equal(t, uint64(3), st.Evacuated(), "invalid final count")
    require.NotNil(t, st.StartedAt(), "invalid final started at")
    require.NotNil(t, st.FinishedAt(), "invalid final finished at")
    require.ElementsMatch(t, expectedShardIDs, st.ShardIDs(), "invalid final shard ids")
    require.Equal(t, "", st.ErrorMessage(), "invalid final error message")
    require.NoError(t, eg.Wait())
}

View file

@ -0,0 +1,31 @@
package shard

import (
    "context"

    "git.frostfs.info/TrueCloudLab/frostfs-api-go/v2/pkg/tracing"
    "go.opentelemetry.io/otel/attribute"
    "go.opentelemetry.io/otel/trace"
)

// LogicalObjectsCount returns logical objects count.
func (s *Shard) LogicalObjectsCount(ctx context.Context) (uint64, error) {
    _, span := tracing.StartSpanFromContext(ctx, "Shard.LogicalObjectsCount",
        trace.WithAttributes(
            attribute.String("shard_id", s.ID().String()),
        ))
    defer span.End()

    s.m.RLock()
    defer s.m.RUnlock()

    if s.GetMode().NoMetabase() {
        return 0, ErrDegradedMode
    }

    cc, err := s.metaBase.ObjectCounters()
    if err != nil {
        return 0, err
    }

    return cc.Logic(), nil
}

View file

@ -0,0 +1,60 @@
package control

import (
    "fmt"
    "time"

    "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/local_object_storage/engine"
    "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/services/control"
    "github.com/mr-tron/base58"
    "google.golang.org/grpc/codes"
    "google.golang.org/grpc/status"
)

func stateToResponse(state *engine.EvacuationState) (*control.GetShardEvacuationStatusResponse, error) {
    shardIDs := make([][]byte, 0, len(state.ShardIDs()))
    for _, shID := range state.ShardIDs() {
        id, err := base58.Decode(shID)
        if err != nil {
            return nil, status.Error(codes.Internal, fmt.Sprintf("invalid shard id format: %s", shID))
        }
        shardIDs = append(shardIDs, id)
    }
    var evacStatus control.GetShardEvacuationStatusResponse_Body_Status
    switch state.ProcessingStatus() {
    case engine.EvacuateProcessStateRunning:
        evacStatus = control.GetShardEvacuationStatusResponse_Body_RUNNING
    case engine.EvacuateProcessStateCompleted:
        evacStatus = control.GetShardEvacuationStatusResponse_Body_COMPLETED
    default:
        evacStatus = control.GetShardEvacuationStatusResponse_Body_EVACUATE_SHARD_STATUS_UNDEFINED
    }
    var startedAt *control.GetShardEvacuationStatusResponse_Body_UnixTimestamp
    if state.StartedAt() != nil {
        startedAt = &control.GetShardEvacuationStatusResponse_Body_UnixTimestamp{
            Value: state.StartedAt().Unix(),
        }
    }
    var duration *control.GetShardEvacuationStatusResponse_Body_Duration
    if state.StartedAt() != nil {
        end := time.Now().UTC()
        if state.FinishedAt() != nil {
            end = *state.FinishedAt()
        }
        duration = &control.GetShardEvacuationStatusResponse_Body_Duration{
            Seconds: int64(end.Sub(*state.StartedAt()).Seconds()),
        }
    }
    return &control.GetShardEvacuationStatusResponse{
        Body: &control.GetShardEvacuationStatusResponse_Body{
            Shard_ID:     shardIDs,
            Evacuated:    state.Evacuated(),
            Total:        state.Total(),
            Failed:       state.Failed(),
            Status:       evacStatus,
            StartedAt:    startedAt,
            Duration:     duration,
            ErrorMessage: state.ErrorMessage(),
        },
    }, nil
}

View file

@ -37,7 +37,7 @@ func (s *Server) EvacuateShard(ctx context.Context, req *control.EvacuateShardRe
    resp := &control.EvacuateShardResponse{
        Body: &control.EvacuateShardResponse_Body{
            Count: uint32(res.Evacuated()),
        },
    }

View file

@ -2,19 +2,96 @@ package control
import (
    "context"
    "errors"

    "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/local_object_storage/engine"
    "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/local_object_storage/util/logicerr"
    "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/services/control"
    "google.golang.org/grpc/codes"
    "google.golang.org/grpc/status"
)

func (s *Server) StartShardEvacuation(ctx context.Context, req *control.StartShardEvacuationRequest) (*control.StartShardEvacuationResponse, error) {
    err := s.isValidRequest(req)
    if err != nil {
        return nil, status.Error(codes.PermissionDenied, err.Error())
    }

    var prm engine.EvacuateShardPrm
    prm.WithShardIDList(s.getShardIDList(req.GetBody().GetShard_ID()))
    prm.WithIgnoreErrors(req.GetBody().GetIgnoreErrors())
    prm.WithFaultHandler(s.replicate)
    prm.WithAsync(true)

    _, err = s.s.Evacuate(ctx, prm)
    if err != nil {
        var logicalErr logicerr.Logical
        if errors.As(err, &logicalErr) {
            return nil, status.Error(codes.Aborted, err.Error())
        }
        return nil, status.Error(codes.Internal, err.Error())
    }

    resp := &control.StartShardEvacuationResponse{
        Body: &control.StartShardEvacuationResponse_Body{},
    }

    err = SignMessage(s.key, resp)
    if err != nil {
        return nil, status.Error(codes.Internal, err.Error())
    }
    return resp, nil
}

func (s *Server) GetShardEvacuationStatus(ctx context.Context, req *control.GetShardEvacuationStatusRequest) (*control.GetShardEvacuationStatusResponse, error) {
    err := s.isValidRequest(req)
    if err != nil {
        return nil, status.Error(codes.PermissionDenied, err.Error())
    }

    state, err := s.s.GetEvacuationState(ctx)
    if err != nil {
        var logicalErr logicerr.Logical
        if errors.As(err, &logicalErr) {
            return nil, status.Error(codes.Aborted, err.Error())
        }
        return nil, status.Error(codes.Internal, err.Error())
    }

    resp, err := stateToResponse(state)
    if err != nil {
        return nil, err
    }

    err = SignMessage(s.key, resp)
    if err != nil {
        return nil, status.Error(codes.Internal, err.Error())
    }
    return resp, nil
}

func (s *Server) StopShardEvacuation(ctx context.Context, req *control.StopShardEvacuationRequest) (*control.StopShardEvacuationResponse, error) {
    err := s.isValidRequest(req)
    if err != nil {
        return nil, status.Error(codes.PermissionDenied, err.Error())
    }

    err = s.s.EnqueRunningEvacuationStop(ctx)
    if err != nil {
        var logicalErr logicerr.Logical
        if errors.As(err, &logicalErr) {
            return nil, status.Error(codes.Aborted, err.Error())
        }
        return nil, status.Error(codes.Internal, err.Error())
    }

    resp := &control.StopShardEvacuationResponse{
        Body: &control.StopShardEvacuationResponse_Body{},
    }

    err = SignMessage(s.key, resp)
    if err != nil {
        return nil, status.Error(codes.Internal, err.Error())
    }
    return resp, nil
}
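The handlers above expose the async flow over the control API. A hedged sketch of a possible client call sequence follows; NewControlServiceClient and the request/response types are the gRPC bindings generated from the proto changed below, while signRequest stands in for the request-signing step that isValidRequest expects and is not shown in this commit:

package example

import (
    "context"

    "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/services/control"
    "google.golang.org/grpc"
)

// startAndCheckEvacuation is a sketch, not part of the commit. signRequest is a
// hypothetical stand-in for whatever signs control requests with the node
// owner's key; conn and rawShardID are prepared by the caller.
func startAndCheckEvacuation(ctx context.Context, conn *grpc.ClientConn, rawShardID []byte,
    signRequest func(msg interface{}) error) error {
    cli := control.NewControlServiceClient(conn)

    startReq := &control.StartShardEvacuationRequest{
        Body: &control.StartShardEvacuationRequest_Body{
            Shard_ID:     [][]byte{rawShardID},
            IgnoreErrors: true,
        },
    }
    if err := signRequest(startReq); err != nil {
        return err
    }
    // The server answers codes.Aborted if another evacuation is already running.
    if _, err := cli.StartShardEvacuation(ctx, startReq); err != nil {
        return err
    }

    statusReq := &control.GetShardEvacuationStatusRequest{
        Body: &control.GetShardEvacuationStatusRequest_Body{},
    }
    if err := signRequest(statusReq); err != nil {
        return err
    }
    st, err := cli.GetShardEvacuationStatus(ctx, statusReq)
    if err != nil {
        return err
    }
    // The body carries Status (RUNNING/COMPLETED), the Evacuated/Total/Failed
    // counters, StartedAt, Duration and ErrorMessage.
    _ = st.GetBody().GetStatus()
    return nil
}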

View file

@ -27,6 +27,7 @@ service ControlService {
    rpc SynchronizeTree (SynchronizeTreeRequest) returns (SynchronizeTreeResponse);

    // EvacuateShard moves all data from one shard to the others.
    // Deprecated: Use StartShardEvacuation/GetShardEvacuationStatus/StopShardEvacuation
    rpc EvacuateShard (EvacuateShardRequest) returns (EvacuateShardResponse);

    // StartShardEvacuation starts moving all data from one shard to the others.

Binary file not shown.