Dmitrii Stepanov
d3b209c8e1
All checks were successful
Vulncheck / Vulncheck (pull_request) Successful in 1m3s
DCO action / DCO (pull_request) Successful in 1m28s
Pre-commit hooks / Pre-commit (pull_request) Successful in 2m2s
Build / Build Components (pull_request) Successful in 2m15s
Tests and linters / Run gofumpt (pull_request) Successful in 2m10s
Tests and linters / gopls check (pull_request) Successful in 2m21s
Tests and linters / Staticcheck (pull_request) Successful in 2m55s
Tests and linters / Lint (pull_request) Successful in 3m29s
Tests and linters / Tests (pull_request) Successful in 3m47s
Tests and linters / Tests with -race (pull_request) Successful in 3m58s
Since the `frostfs-cli control shards rebuild` command was added, there is no need for background rebuild anymore. For failover tests, the value 1 was used to rebuild only on schema change. Signed-off-by: Dmitrii Stepanov <d.stepanov@yadro.com>
192 lines
4.1 KiB
Go
192 lines
4.1 KiB
Go
package shard
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"sync"
|
|
|
|
"git.frostfs.info/TrueCloudLab/frostfs-node/internal/logs"
|
|
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/local_object_storage/blobstor"
|
|
meta "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/local_object_storage/metabase"
|
|
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/util/logger"
|
|
"git.frostfs.info/TrueCloudLab/frostfs-observability/tracing"
|
|
oid "git.frostfs.info/TrueCloudLab/frostfs-sdk-go/object/id"
|
|
"go.opentelemetry.io/otel/attribute"
|
|
"go.opentelemetry.io/otel/trace"
|
|
"go.uber.org/zap"
|
|
)
|
|
|
|
// ErrRebuildInProgress is returned by ScheduleRebuild when the rebuild task
// cannot be handed over to the rebuild worker immediately.
var ErrRebuildInProgress = errors.New("shard rebuild in progress")
|
|
|
|
// RebuildWorkerLimiter hands out work slots to rebuild workers.
type RebuildWorkerLimiter interface {
	// AcquireWorkSlot takes one work slot. A non-nil error means that no slot
	// was taken.
	AcquireWorkSlot(ctx context.Context) error
	// ReleaseWorkSlot gives back one previously acquired work slot.
	ReleaseWorkSlot()
}

// rebuildLimiter is a RebuildWorkerLimiter built on a buffered channel:
// every element stored in the channel is one occupied work slot.
type rebuildLimiter struct {
	semaphore chan struct{}
}

// NewRebuildLimiter returns a limiter whose slot channel has a capacity of
// workersCount.
func NewRebuildLimiter(workersCount uint32) RebuildWorkerLimiter {
	limiter := new(rebuildLimiter)
	limiter.semaphore = make(chan struct{}, workersCount)
	return limiter
}

// AcquireWorkSlot waits until either a slot can be occupied, in which case it
// returns nil, or ctx is done, in which case it returns ctx.Err().
func (rl *rebuildLimiter) AcquireWorkSlot(ctx context.Context) error {
	var slot struct{}
	select {
	case <-ctx.Done():
		return ctx.Err()
	case rl.semaphore <- slot:
		return nil
	}
}

// ReleaseWorkSlot frees one occupied slot. It blocks while no slot is occupied.
func (rl *rebuildLimiter) ReleaseWorkSlot() {
	<-rl.semaphore
}
|
|
|
|
type rebuildTask struct {
|
|
limiter RebuildWorkerLimiter
|
|
fillPercent int
|
|
}
|
|
|
|
type rebuilder struct {
|
|
mtx *sync.Mutex
|
|
wg *sync.WaitGroup
|
|
cancel func()
|
|
done chan struct{}
|
|
tasks chan rebuildTask
|
|
}
|
|
|
|
func newRebuilder() *rebuilder {
|
|
return &rebuilder{
|
|
mtx: &sync.Mutex{},
|
|
wg: &sync.WaitGroup{},
|
|
tasks: make(chan rebuildTask),
|
|
}
|
|
}
|
|
|
|
func (r *rebuilder) Start(ctx context.Context, bs *blobstor.BlobStor, mb *meta.DB, log *logger.Logger) {
|
|
r.mtx.Lock()
|
|
defer r.mtx.Unlock()
|
|
|
|
if r.done != nil {
|
|
return // already started
|
|
}
|
|
ctx, cancel := context.WithCancel(ctx)
|
|
r.cancel = cancel
|
|
r.done = make(chan struct{})
|
|
r.wg.Add(1)
|
|
go func() {
|
|
defer r.wg.Done()
|
|
for {
|
|
select {
|
|
case <-r.done:
|
|
return
|
|
case t, ok := <-r.tasks:
|
|
if !ok {
|
|
continue
|
|
}
|
|
runRebuild(ctx, bs, mb, log, t.fillPercent, t.limiter)
|
|
}
|
|
}
|
|
}()
|
|
}
|
|
|
|
func runRebuild(ctx context.Context, bs *blobstor.BlobStor, mb *meta.DB, log *logger.Logger,
|
|
fillPercent int, limiter RebuildWorkerLimiter,
|
|
) {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
default:
|
|
}
|
|
log.Info(logs.BlobstoreRebuildStarted)
|
|
if err := bs.Rebuild(ctx, &mbStorageIDUpdate{mb: mb}, limiter, fillPercent); err != nil {
|
|
log.Warn(logs.FailedToRebuildBlobstore, zap.Error(err))
|
|
} else {
|
|
log.Info(logs.BlobstoreRebuildCompletedSuccessfully)
|
|
}
|
|
}
|
|
|
|
func (r *rebuilder) ScheduleRebuild(ctx context.Context, limiter RebuildWorkerLimiter, fillPercent int,
|
|
) error {
|
|
select {
|
|
case <-ctx.Done():
|
|
return ctx.Err()
|
|
case r.tasks <- rebuildTask{
|
|
limiter: limiter,
|
|
fillPercent: fillPercent,
|
|
}:
|
|
return nil
|
|
default:
|
|
return ErrRebuildInProgress
|
|
}
|
|
}
|
|
|
|
func (r *rebuilder) Stop(log *logger.Logger) {
|
|
r.mtx.Lock()
|
|
defer r.mtx.Unlock()
|
|
|
|
if r.done != nil {
|
|
close(r.done)
|
|
}
|
|
if r.cancel != nil {
|
|
r.cancel()
|
|
}
|
|
r.wg.Wait()
|
|
r.cancel = nil
|
|
r.done = nil
|
|
log.Info(logs.BlobstoreRebuildStopped)
|
|
}
|
|
|
|
var errMBIsNotAvailable = errors.New("metabase is not available")
|
|
|
|
type mbStorageIDUpdate struct {
|
|
mb *meta.DB
|
|
}
|
|
|
|
func (u *mbStorageIDUpdate) UpdateStorageID(ctx context.Context, addr oid.Address, storageID []byte) error {
|
|
select {
|
|
case <-ctx.Done():
|
|
return ctx.Err()
|
|
default:
|
|
}
|
|
|
|
if u.mb == nil {
|
|
return errMBIsNotAvailable
|
|
}
|
|
|
|
var prm meta.UpdateStorageIDPrm
|
|
prm.SetAddress(addr)
|
|
prm.SetStorageID(storageID)
|
|
_, err := u.mb.UpdateStorageID(ctx, prm)
|
|
return err
|
|
}
|
|
|
|
type RebuildPrm struct {
|
|
ConcurrencyLimiter RebuildWorkerLimiter
|
|
TargetFillPercent uint32
|
|
}
|
|
|
|
func (s *Shard) ScheduleRebuild(ctx context.Context, p RebuildPrm) error {
|
|
ctx, span := tracing.StartSpanFromContext(ctx, "Shard.ScheduleRebuild",
|
|
trace.WithAttributes(
|
|
attribute.String("shard_id", s.ID().String()),
|
|
attribute.Int64("target_fill_percent", int64(p.TargetFillPercent)),
|
|
))
|
|
defer span.End()
|
|
|
|
s.m.RLock()
|
|
defer s.m.RUnlock()
|
|
|
|
if s.info.Mode.ReadOnly() {
|
|
return ErrReadOnlyMode
|
|
}
|
|
if s.info.Mode.NoMetabase() {
|
|
return ErrDegradedMode
|
|
}
|
|
|
|
return s.rb.ScheduleRebuild(ctx, p.ConcurrencyLimiter, int(p.TargetFillPercent))
|
|
}
|