forked from TrueCloudLab/frostfs-node
464 lines
11 KiB
Go
464 lines
11 KiB
Go
package shard
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"slices"
|
|
"sync"
|
|
|
|
"git.frostfs.info/TrueCloudLab/frostfs-node/internal/logs"
|
|
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/core/object"
|
|
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/local_object_storage/blobstor"
|
|
meta "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/local_object_storage/metabase"
|
|
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/local_object_storage/shard/mode"
|
|
"git.frostfs.info/TrueCloudLab/frostfs-observability/tracing"
|
|
"git.frostfs.info/TrueCloudLab/frostfs-sdk-go/client"
|
|
objectSDK "git.frostfs.info/TrueCloudLab/frostfs-sdk-go/object"
|
|
oid "git.frostfs.info/TrueCloudLab/frostfs-sdk-go/object/id"
|
|
"go.uber.org/zap"
|
|
"golang.org/x/sync/errgroup"
|
|
)
|
|
|
|
func (s *Shard) handleMetabaseFailure(ctx context.Context, stage string, err error) error {
|
|
s.log.Error(ctx, logs.ShardMetabaseFailureSwitchingMode,
|
|
zap.String("stage", stage),
|
|
zap.Stringer("mode", mode.ReadOnly),
|
|
zap.Error(err))
|
|
|
|
err = s.SetMode(ctx, mode.ReadOnly)
|
|
if err == nil {
|
|
return nil
|
|
}
|
|
|
|
s.log.Error(ctx, logs.ShardCantMoveShardToReadonlySwitchMode,
|
|
zap.String("stage", stage),
|
|
zap.Stringer("mode", mode.DegradedReadOnly),
|
|
zap.Error(err))
|
|
|
|
err = s.SetMode(ctx, mode.DegradedReadOnly)
|
|
if err != nil {
|
|
return fmt.Errorf("switch to mode %s", mode.Mode(mode.DegradedReadOnly))
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Open opens all Shard's components.
|
|
func (s *Shard) Open(ctx context.Context) error {
|
|
components := []interface {
|
|
Open(context.Context, mode.Mode) error
|
|
}{
|
|
s.blobStor,
|
|
}
|
|
m := s.GetMode()
|
|
|
|
if !m.NoMetabase() {
|
|
components = append(components, s.metaBase)
|
|
}
|
|
|
|
if s.hasWriteCache() && !m.NoMetabase() {
|
|
components = append(components, s.writeCache)
|
|
}
|
|
|
|
if s.pilorama != nil {
|
|
components = append(components, s.pilorama)
|
|
}
|
|
|
|
for i, component := range components {
|
|
if err := component.Open(ctx, m); err != nil {
|
|
if component == s.metaBase {
|
|
// We must first open all other components to avoid
|
|
// opening non-existent DB in read-only mode.
|
|
for j := i + 1; j < len(components); j++ {
|
|
if err := components[j].Open(ctx, m); err != nil {
|
|
// Other components must be opened, fail.
|
|
return fmt.Errorf("open %T: %w", components[j], err)
|
|
}
|
|
}
|
|
err = s.handleMetabaseFailure(ctx, "open", err)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
break
|
|
}
|
|
|
|
return fmt.Errorf("open %T: %w", component, err)
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// metabaseSynchronizer is a Shard with a different method set: its Init
// refills the metabase rather than running the regular metabase Init.
type metabaseSynchronizer Shard
|
|
|
|
func (x *metabaseSynchronizer) Init(ctx context.Context) error {
|
|
ctx, span := tracing.StartSpanFromContext(ctx, "metabaseSynchronizer.Init")
|
|
defer span.End()
|
|
|
|
return (*Shard)(x).refillMetabase(ctx)
|
|
}
|
|
|
|
// Init initializes all Shard's components.
|
|
func (s *Shard) Init(ctx context.Context) error {
|
|
m := s.GetMode()
|
|
if err := s.initializeComponents(ctx, m); err != nil {
|
|
return err
|
|
}
|
|
|
|
s.updateMetrics(ctx)
|
|
|
|
s.gc = &gc{
|
|
gcCfg: &s.gcCfg,
|
|
remover: s.removeGarbage,
|
|
stopChannel: make(chan struct{}),
|
|
eventChan: make(chan Event),
|
|
mEventHandler: map[eventType]*eventHandlers{
|
|
eventNewEpoch: {
|
|
cancelFunc: func() {},
|
|
handlers: []eventHandler{
|
|
s.collectExpiredLocks,
|
|
s.collectExpiredObjects,
|
|
s.collectExpiredTombstones,
|
|
s.collectExpiredMetrics,
|
|
},
|
|
},
|
|
},
|
|
}
|
|
if s.gc.metrics != nil {
|
|
s.gc.metrics.SetShardID(s.info.ID.String())
|
|
}
|
|
|
|
s.gc.init(ctx)
|
|
|
|
s.rb = newRebuilder()
|
|
if !m.NoMetabase() {
|
|
s.rb.Start(ctx, s.blobStor, s.metaBase, s.log)
|
|
}
|
|
s.writecacheSealCancel.Store(dummyCancel)
|
|
return nil
|
|
}
|
|
|
|
func (s *Shard) initializeComponents(ctx context.Context, m mode.Mode) error {
|
|
type initializer interface {
|
|
Init(context.Context) error
|
|
}
|
|
|
|
var components []initializer
|
|
|
|
if !m.NoMetabase() {
|
|
var initMetabase initializer
|
|
|
|
if s.NeedRefillMetabase() {
|
|
initMetabase = (*metabaseSynchronizer)(s)
|
|
} else {
|
|
initMetabase = s.metaBase
|
|
}
|
|
|
|
components = []initializer{
|
|
s.blobStor, initMetabase,
|
|
}
|
|
} else {
|
|
components = []initializer{s.blobStor}
|
|
}
|
|
|
|
if s.hasWriteCache() && !m.NoMetabase() {
|
|
components = append(components, s.writeCache)
|
|
}
|
|
|
|
if s.pilorama != nil {
|
|
components = append(components, s.pilorama)
|
|
}
|
|
|
|
for _, component := range components {
|
|
if err := component.Init(ctx); err != nil {
|
|
if component == s.metaBase {
|
|
if errors.Is(err, meta.ErrOutdatedVersion) || errors.Is(err, meta.ErrIncompletedUpgrade) {
|
|
return fmt.Errorf("metabase initialization: %w", err)
|
|
}
|
|
|
|
err = s.handleMetabaseFailure(ctx, "init", err)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
break
|
|
}
|
|
|
|
return fmt.Errorf("initialize %T: %w", component, err)
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (s *Shard) refillMetabase(ctx context.Context) error {
|
|
path := s.metaBase.DumpInfo().Path
|
|
s.metricsWriter.SetRefillStatus(path, "running")
|
|
s.metricsWriter.SetRefillPercent(path, 0)
|
|
var success bool
|
|
defer func() {
|
|
if success {
|
|
s.metricsWriter.SetRefillStatus(path, "completed")
|
|
} else {
|
|
s.metricsWriter.SetRefillStatus(path, "failed")
|
|
}
|
|
}()
|
|
|
|
err := s.metaBase.Reset()
|
|
if err != nil {
|
|
return fmt.Errorf("reset metabase: %w", err)
|
|
}
|
|
|
|
withCount := true
|
|
totalObjects, err := s.blobStor.ObjectsCount(ctx)
|
|
if err != nil {
|
|
s.log.Warn(ctx, logs.EngineRefillFailedToGetObjectsCount, zap.Error(err))
|
|
withCount = false
|
|
}
|
|
|
|
eg, egCtx := errgroup.WithContext(ctx)
|
|
if s.cfg.refillMetabaseWorkersCount > 0 {
|
|
eg.SetLimit(s.cfg.refillMetabaseWorkersCount)
|
|
}
|
|
|
|
var completedCount uint64
|
|
var metricGuard sync.Mutex
|
|
itErr := blobstor.IterateBinaryObjects(egCtx, s.blobStor, func(addr oid.Address, data []byte, descriptor []byte) error {
|
|
eg.Go(func() error {
|
|
var success bool
|
|
defer func() {
|
|
s.metricsWriter.IncRefillObjectsCount(path, len(data), success)
|
|
if withCount {
|
|
metricGuard.Lock()
|
|
completedCount++
|
|
s.metricsWriter.SetRefillPercent(path, uint32(completedCount*100/totalObjects))
|
|
metricGuard.Unlock()
|
|
}
|
|
}()
|
|
|
|
if err := s.refillObject(egCtx, data, addr, descriptor); err != nil {
|
|
return err
|
|
}
|
|
success = true
|
|
return nil
|
|
})
|
|
|
|
select {
|
|
case <-egCtx.Done():
|
|
return egCtx.Err()
|
|
default:
|
|
return nil
|
|
}
|
|
})
|
|
|
|
egErr := eg.Wait()
|
|
|
|
err = errors.Join(egErr, itErr)
|
|
if err != nil {
|
|
return fmt.Errorf("put objects to the meta: %w", err)
|
|
}
|
|
|
|
err = s.metaBase.SyncCounters()
|
|
if err != nil {
|
|
return fmt.Errorf("sync object counters: %w", err)
|
|
}
|
|
|
|
success = true
|
|
s.metricsWriter.SetRefillPercent(path, 100)
|
|
return nil
|
|
}
|
|
|
|
func (s *Shard) refillObject(ctx context.Context, data []byte, addr oid.Address, descriptor []byte) error {
|
|
obj := objectSDK.New()
|
|
if err := obj.Unmarshal(data); err != nil {
|
|
s.log.Warn(ctx, logs.ShardCouldNotUnmarshalObject,
|
|
zap.Stringer("address", addr),
|
|
zap.Error(err))
|
|
return nil
|
|
}
|
|
|
|
hasIndexedAttribute := slices.IndexFunc(obj.Attributes(), func(attr objectSDK.Attribute) bool { return meta.IsAtrributeIndexed(attr.Key()) }) > 0
|
|
|
|
var isIndexedContainer bool
|
|
if hasIndexedAttribute {
|
|
info, err := s.containerInfo.Info(addr.Container())
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if info.Removed {
|
|
s.log.Debug(ctx, logs.ShardSkipObjectFromResyncContainerDeleted, zap.Stringer("address", addr))
|
|
return nil
|
|
}
|
|
isIndexedContainer = info.Indexed
|
|
}
|
|
|
|
var err error
|
|
switch obj.Type() {
|
|
case objectSDK.TypeTombstone:
|
|
err = s.refillTombstoneObject(ctx, obj)
|
|
case objectSDK.TypeLock:
|
|
err = s.refillLockObject(ctx, obj)
|
|
default:
|
|
}
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
var mPrm meta.PutPrm
|
|
mPrm.SetObject(obj)
|
|
mPrm.SetStorageID(descriptor)
|
|
mPrm.SetIndexAttributes(hasIndexedAttribute && isIndexedContainer)
|
|
|
|
_, err = s.metaBase.Put(ctx, mPrm)
|
|
if err != nil && !client.IsErrObjectAlreadyRemoved(err) && !errors.Is(err, meta.ErrObjectIsExpired) {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (s *Shard) refillLockObject(ctx context.Context, obj *objectSDK.Object) error {
|
|
var lock objectSDK.Lock
|
|
if err := lock.Unmarshal(obj.Payload()); err != nil {
|
|
return fmt.Errorf("unmarshal lock content: %w", err)
|
|
}
|
|
|
|
locked := make([]oid.ID, lock.NumberOfMembers())
|
|
lock.ReadMembers(locked)
|
|
|
|
cnr, _ := obj.ContainerID()
|
|
id, _ := obj.ID()
|
|
err := s.metaBase.Lock(ctx, cnr, id, locked)
|
|
if err != nil {
|
|
return fmt.Errorf("lock objects: %w", err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (s *Shard) refillTombstoneObject(ctx context.Context, obj *objectSDK.Object) error {
|
|
tombstone := objectSDK.NewTombstone()
|
|
|
|
if err := tombstone.Unmarshal(obj.Payload()); err != nil {
|
|
return fmt.Errorf("unmarshal tombstone content: %w", err)
|
|
}
|
|
|
|
tombAddr := object.AddressOf(obj)
|
|
memberIDs := tombstone.Members()
|
|
tombMembers := make([]oid.Address, 0, len(memberIDs))
|
|
|
|
for i := range memberIDs {
|
|
a := tombAddr
|
|
a.SetObject(memberIDs[i])
|
|
|
|
tombMembers = append(tombMembers, a)
|
|
}
|
|
|
|
var inhumePrm meta.InhumePrm
|
|
|
|
inhumePrm.SetTombstoneAddress(tombAddr)
|
|
inhumePrm.SetAddresses(tombMembers...)
|
|
|
|
_, err := s.metaBase.Inhume(ctx, inhumePrm)
|
|
if err != nil {
|
|
return fmt.Errorf("inhume objects: %w", err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Close releases all Shard's components.
|
|
func (s *Shard) Close(ctx context.Context) error {
|
|
if s.rb != nil {
|
|
s.rb.Stop(ctx, s.log)
|
|
}
|
|
var components []interface{ Close(context.Context) error }
|
|
|
|
if s.pilorama != nil {
|
|
components = append(components, s.pilorama)
|
|
}
|
|
|
|
if s.hasWriteCache() {
|
|
prev := s.writecacheSealCancel.Swap(notInitializedCancel)
|
|
prev.cancel() // no need to wait: writecache.Seal and writecache.Close lock the same mutex
|
|
components = append(components, s.writeCache)
|
|
}
|
|
|
|
components = append(components, s.blobStor, s.metaBase)
|
|
|
|
var lastErr error
|
|
for _, component := range components {
|
|
if err := component.Close(ctx); err != nil {
|
|
lastErr = err
|
|
s.log.Error(ctx, logs.ShardCouldNotCloseShardComponent, zap.Error(err))
|
|
}
|
|
}
|
|
|
|
// If Init/Open was unsuccessful gc can be nil.
|
|
if s.gc != nil {
|
|
s.gc.stop(ctx)
|
|
}
|
|
|
|
return lastErr
|
|
}
|
|
|
|
// Reload reloads configuration portions that are necessary.
|
|
// If a config option is invalid, it logs an error and returns nil.
|
|
// If there was a problem with applying new configuration, an error is returned.
|
|
func (s *Shard) Reload(ctx context.Context, opts ...Option) error {
|
|
ctx, span := tracing.StartSpanFromContext(ctx, "Shard.Reload")
|
|
defer span.End()
|
|
|
|
// Do not use defaultCfg here missing options need not be reloaded.
|
|
var c cfg
|
|
for i := range opts {
|
|
opts[i](&c)
|
|
}
|
|
|
|
unlock := s.lockExclusive()
|
|
defer unlock()
|
|
|
|
s.rb.Stop(ctx, s.log)
|
|
if !s.info.Mode.NoMetabase() {
|
|
defer func() {
|
|
s.rb.Start(ctx, s.blobStor, s.metaBase, s.log)
|
|
}()
|
|
}
|
|
|
|
ok, err := s.metaBase.Reload(ctx, c.metaOpts...)
|
|
if err != nil {
|
|
if errors.Is(err, meta.ErrDegradedMode) {
|
|
s.log.Error(ctx, logs.ShardCantOpenMetabaseMoveToADegradedMode, zap.Error(err))
|
|
_ = s.setMode(ctx, mode.DegradedReadOnly)
|
|
}
|
|
return err
|
|
}
|
|
if ok {
|
|
var err error
|
|
if c.refillMetabase {
|
|
// Here we refill metabase only if a new instance was opened. This is a feature,
|
|
// we don't want to hang for some time just because we forgot to change
|
|
// config after the node was updated.
|
|
err = s.refillMetabase(ctx)
|
|
} else {
|
|
err = s.metaBase.Init(ctx)
|
|
}
|
|
if err != nil {
|
|
s.log.Error(ctx, logs.ShardCantInitializeMetabaseMoveToADegradedreadonlyMode, zap.Error(err))
|
|
_ = s.setMode(ctx, mode.DegradedReadOnly)
|
|
return err
|
|
}
|
|
}
|
|
return s.setMode(ctx, c.info.Mode)
|
|
}
|
|
|
|
func (s *Shard) lockExclusive() func() {
|
|
s.setModeRequested.Store(true)
|
|
val := s.gcCancel.Load()
|
|
if val != nil {
|
|
cancelGC := val.(context.CancelFunc)
|
|
cancelGC()
|
|
}
|
|
if c := s.writecacheSealCancel.Load(); c != nil {
|
|
c.cancel()
|
|
}
|
|
s.m.Lock()
|
|
s.setModeRequested.Store(false)
|
|
return s.m.Unlock
|
|
}
|