frostfs-node/pkg/local_object_storage/shard/rebuild.go

package shard

import (
    "context"
    "errors"
    "sync"

    "git.frostfs.info/TrueCloudLab/frostfs-node/internal/logs"
    "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/local_object_storage/blobstor"
    meta "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/local_object_storage/metabase"
    "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/util/logger"
    "git.frostfs.info/TrueCloudLab/frostfs-observability/tracing"
    oid "git.frostfs.info/TrueCloudLab/frostfs-sdk-go/object/id"
    "go.opentelemetry.io/otel/attribute"
    "go.opentelemetry.io/otel/trace"
    "go.uber.org/zap"
)
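
// ErrRebuildInProgress is returned by ScheduleRebuild when a new rebuild task
// cannot be accepted because a previous one is still being processed.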
var ErrRebuildInProgress = errors.New("shard rebuild in progress")
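
// RebuildWorkerLimiter limits the number of concurrently running rebuild workers.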
type RebuildWorkerLimiter interface {
    AcquireWorkSlot(ctx context.Context) error
    ReleaseWorkSlot()
}
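
// rebuildLimiter implements RebuildWorkerLimiter on top of a buffered-channel semaphore.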
type rebuildLimiter struct {
    semaphore chan struct{}
}
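
// NewRebuildLimiter returns a limiter that allows at most workersCount work slots
// to be held at the same time.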
func NewRebuildLimiter(workersCount uint32) RebuildWorkerLimiter {
    return &rebuildLimiter{
        semaphore: make(chan struct{}, workersCount),
    }
}

func (l *rebuildLimiter) AcquireWorkSlot(ctx context.Context) error {
    select {
    case l.semaphore <- struct{}{}:
        return nil
    case <-ctx.Done():
        return ctx.Err()
    }
}

func (l *rebuildLimiter) ReleaseWorkSlot() {
    <-l.semaphore
}

type rebuildTask struct {
    limiter     RebuildWorkerLimiter
    fillPercent int
}
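
// rebuilder runs scheduled blobstor rebuild tasks in a background goroutine.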
type rebuilder struct {
    mtx    *sync.Mutex
    wg     *sync.WaitGroup
    cancel func()
    done   chan struct{}
    tasks  chan rebuildTask
}

func newRebuilder() *rebuilder {
    return &rebuilder{
        mtx:   &sync.Mutex{},
        wg:    &sync.WaitGroup{},
        tasks: make(chan rebuildTask),
    }
}
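
// Start launches the background worker that executes scheduled rebuild tasks.
// Calling Start on an already started rebuilder is a no-op.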
func (r *rebuilder) Start(ctx context.Context, bs *blobstor.BlobStor, mb *meta.DB, log *logger.Logger) {
    r.mtx.Lock()
    defer r.mtx.Unlock()

    if r.done != nil {
        return // already started
    }
    ctx, cancel := context.WithCancel(ctx)
    r.cancel = cancel
    r.done = make(chan struct{})
    r.wg.Add(1)
    go func() {
        defer r.wg.Done()
        for {
            select {
            case <-r.done:
                return
            case t, ok := <-r.tasks:
                if !ok {
                    continue
                }
                runRebuild(ctx, bs, mb, log, t.fillPercent, t.limiter)
            }
        }
    }()
}
func runRebuild(ctx context.Context, bs *blobstor.BlobStor, mb *meta.DB, log *logger.Logger,
    fillPercent int, limiter RebuildWorkerLimiter,
) {
    select {
    case <-ctx.Done():
        return
    default:
    }

    log.Info(ctx, logs.BlobstoreRebuildStarted)
    if err := bs.Rebuild(ctx, &mbStorageIDUpdate{mb: mb}, limiter, fillPercent); err != nil {
        log.Warn(ctx, logs.FailedToRebuildBlobstore, zap.Error(err))
    } else {
        log.Info(ctx, logs.BlobstoreRebuildCompletedSuccessfully)
    }
}
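
// ScheduleRebuild hands a rebuild task to the background worker.
// It returns ErrRebuildInProgress if the worker cannot accept a new task right now.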
func (r *rebuilder) ScheduleRebuild(ctx context.Context, limiter RebuildWorkerLimiter, fillPercent int,
) error {
    select {
    case <-ctx.Done():
        return ctx.Err()
    case r.tasks <- rebuildTask{
        limiter:     limiter,
        fillPercent: fillPercent,
    }:
        return nil
    default:
        return ErrRebuildInProgress
    }
}
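
// Stop cancels the running rebuild, waits for the worker goroutine to exit
// and resets the rebuilder so that it can be started again.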
func (r *rebuilder) Stop(log *logger.Logger) {
    r.mtx.Lock()
    defer r.mtx.Unlock()

    if r.done != nil {
        close(r.done)
    }
    if r.cancel != nil {
        r.cancel()
    }
    r.wg.Wait()
    r.cancel = nil
    r.done = nil
    log.Info(context.Background(), logs.BlobstoreRebuildStopped)
}

var errMBIsNotAvailable = errors.New("metabase is not available")
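
// mbStorageIDUpdate persists updated storage IDs to the metabase during rebuild.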
type mbStorageIDUpdate struct {
    mb *meta.DB
}

func (u *mbStorageIDUpdate) UpdateStorageID(ctx context.Context, addr oid.Address, storageID []byte) error {
    select {
    case <-ctx.Done():
        return ctx.Err()
    default:
    }

    if u.mb == nil {
        return errMBIsNotAvailable
    }

    var prm meta.UpdateStorageIDPrm
    prm.SetAddress(addr)
    prm.SetStorageID(storageID)
    _, err := u.mb.UpdateStorageID(ctx, prm)
    return err
}
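
// RebuildPrm groups the parameters of Shard.ScheduleRebuild.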
type RebuildPrm struct {
    ConcurrencyLimiter RebuildWorkerLimiter
    TargetFillPercent  uint32
}
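
// ScheduleRebuild queues a blobstor rebuild for the shard. It returns
// ErrReadOnlyMode in read-only mode and ErrDegradedMode when the metabase
// is unavailable.
//
// A rough usage sketch from a caller's perspective (sh and limiter are
// hypothetical variables, not part of this file):
//
//    limiter := shard.NewRebuildLimiter(4)
//    err := sh.ScheduleRebuild(ctx, shard.RebuildPrm{
//        ConcurrencyLimiter: limiter,
//        TargetFillPercent:  80,
//    })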
func (s *Shard) ScheduleRebuild(ctx context.Context, p RebuildPrm) error {
    ctx, span := tracing.StartSpanFromContext(ctx, "Shard.ScheduleRebuild",
        trace.WithAttributes(
            attribute.String("shard_id", s.ID().String()),
            attribute.Int64("target_fill_percent", int64(p.TargetFillPercent)),
        ))
    defer span.End()

    s.m.RLock()
    defer s.m.RUnlock()

    if s.info.Mode.ReadOnly() {
        return ErrReadOnlyMode
    }
    if s.info.Mode.NoMetabase() {
        return ErrDegradedMode
    }

    return s.rb.ScheduleRebuild(ctx, p.ConcurrencyLimiter, int(p.TargetFillPercent))
}