frostfs-node/pkg/local_object_storage/engine/shards.go
Evgenii Stratonikov 4dc9a1b300 [#1413] engine: Remove error counting methods from Shard
All error counting and hangling logic is present on the engine level.
Currently, we pass engine metrics with shard ID metric to shard, then
export 3 methods to manipulate these metrics.
In this commits all methods are removed and error counter is tracked on
the engine level exlusively.

Signed-off-by: Evgenii Stratonikov <e.stratonikov@yadro.com>
2024-10-04 15:10:17 +03:00

444 lines
10 KiB
Go

package engine
import (
"context"
"errors"
"fmt"
"sync"
"sync/atomic"
"git.frostfs.info/TrueCloudLab/frostfs-node/internal/logs"
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/local_object_storage/shard"
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/local_object_storage/shard/mode"
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/local_object_storage/util/logicerr"
oid "git.frostfs.info/TrueCloudLab/frostfs-sdk-go/object/id"
"git.frostfs.info/TrueCloudLab/hrw"
"github.com/google/uuid"
"github.com/panjf2000/ants/v2"
"go.uber.org/zap"
"golang.org/x/sync/errgroup"
)
var errShardNotFound = logicerr.New("shard not found")
type hashedShard struct {
shardWrapper
hash uint64
}
type metricsWithID struct {
id string
mw MetricRegister
}
func (m *metricsWithID) SetShardID(id string) {
// concurrent settings are not expected =>
// no mutex protection
m.id = id
}
func (m *metricsWithID) SetObjectCounter(objectType string, v uint64) {
m.mw.SetObjectCounter(m.id, objectType, v)
}
func (m *metricsWithID) AddToObjectCounter(objectType string, delta int) {
m.mw.AddToObjectCounter(m.id, objectType, delta)
}
func (m *metricsWithID) IncObjectCounter(objectType string) {
m.mw.AddToObjectCounter(m.id, objectType, +1)
}
func (m *metricsWithID) SetMode(mode mode.Mode) {
m.mw.SetMode(m.id, mode)
}
func (m *metricsWithID) AddToContainerSize(cnr string, size int64) {
m.mw.AddToContainerSize(cnr, size)
}
func (m *metricsWithID) AddToPayloadSize(size int64) {
m.mw.AddToPayloadCounter(m.id, size)
}
func (m *metricsWithID) IncErrorCounter() {
m.mw.IncErrorCounter(m.id)
}
func (m *metricsWithID) ClearErrorCounter() {
m.mw.ClearErrorCounter(m.id)
}
func (m *metricsWithID) DeleteShardMetrics() {
m.mw.DeleteShardMetrics(m.id)
}
func (m *metricsWithID) SetContainerObjectsCount(cnrID string, objectType string, value uint64) {
m.mw.SetContainerObjectCounter(m.id, cnrID, objectType, value)
}
func (m *metricsWithID) IncContainerObjectsCount(cnrID string, objectType string) {
m.mw.IncContainerObjectCounter(m.id, cnrID, objectType)
}
func (m *metricsWithID) SubContainerObjectsCount(cnrID string, objectType string, value uint64) {
m.mw.SubContainerObjectCounter(m.id, cnrID, objectType, value)
}
func (m *metricsWithID) IncRefillObjectsCount(path string, size int, success bool) {
m.mw.IncRefillObjectsCount(m.id, path, size, success)
}
func (m *metricsWithID) SetRefillPercent(path string, percent uint32) {
m.mw.SetRefillPercent(m.id, path, percent)
}
func (m *metricsWithID) SetRefillStatus(path string, status string) {
m.mw.SetRefillStatus(m.id, path, status)
}
func (m *metricsWithID) SetEvacuationInProgress(value bool) {
m.mw.SetEvacuationInProgress(m.id, value)
}
// AddShard adds a new shard to the storage engine.
//
// Returns any error encountered that did not allow adding a shard.
// Otherwise returns the ID of the added shard.
func (e *StorageEngine) AddShard(ctx context.Context, opts ...shard.Option) (*shard.ID, error) {
sh, err := e.createShard(ctx, opts)
if err != nil {
return nil, fmt.Errorf("could not create a shard: %w", err)
}
err = e.addShard(sh)
if err != nil {
return nil, fmt.Errorf("could not add %s shard: %w", sh.ID().String(), err)
}
e.cfg.metrics.SetMode(sh.ID().String(), sh.GetMode())
return sh.ID(), nil
}
func (e *StorageEngine) createShard(_ context.Context, opts []shard.Option) (*shard.Shard, error) {
id, err := generateShardID()
if err != nil {
return nil, fmt.Errorf("could not generate shard ID: %w", err)
}
opts = e.appendMetrics(id, opts)
sh := shard.New(append(opts,
shard.WithID(id),
shard.WithExpiredTombstonesCallback(e.processExpiredTombstones),
shard.WithExpiredLocksCallback(e.processExpiredLocks),
shard.WithDeletedLockCallback(e.processDeletedLocks),
shard.WithReportErrorFunc(e.reportShardErrorByID),
shard.WithZeroSizeCallback(e.processZeroSizeContainers),
shard.WithZeroCountCallback(e.processZeroCountContainers),
)...)
if err := sh.UpdateID(); err != nil {
e.log.Warn(logs.FailedToUpdateShardID, zap.Stringer("shard_id", sh.ID()), zap.String("metabase_path", sh.DumpInfo().MetaBaseInfo.Path), zap.Error(err))
}
return sh, nil
}
func (e *StorageEngine) appendMetrics(id *shard.ID, opts []shard.Option) []shard.Option {
e.mtx.RLock()
defer e.mtx.RUnlock()
opts = append(opts,
shard.WithMetricsWriter(
&metricsWithID{
id: id.String(),
mw: e.metrics,
},
),
shard.WithWriteCacheMetrics(
&writeCacheMetrics{
shardID: id.String(),
metrics: e.metrics.WriteCache(),
},
),
shard.WithGCMetrics(
&gcMetrics{
storage: e.metrics.GC(),
shardID: id.String(),
},
),
)
return opts
}
func (e *StorageEngine) addShard(sh *shard.Shard) error {
e.mtx.Lock()
defer e.mtx.Unlock()
pool, err := ants.NewPool(int(e.shardPoolSize), ants.WithNonblocking(true))
if err != nil {
return fmt.Errorf("could not create pool: %w", err)
}
strID := sh.ID().String()
if _, ok := e.shards[strID]; ok {
return fmt.Errorf("shard with id %s was already added", strID)
}
e.shards[strID] = hashedShard{
shardWrapper: shardWrapper{
errorCount: new(atomic.Uint32),
Shard: sh,
},
hash: hrw.StringHash(strID),
}
e.shardPools[strID] = pool
return nil
}
// removeShards removes specified shards. Skips non-existent shards.
// Logs errors about shards that it could not Close after the removal.
func (e *StorageEngine) removeShards(ids ...string) {
if len(ids) == 0 {
return
}
ss := make([]hashedShard, 0, len(ids))
e.mtx.Lock()
for _, id := range ids {
sh, found := e.shards[id]
if !found {
continue
}
e.metrics.DeleteShardMetrics(id)
ss = append(ss, sh)
delete(e.shards, id)
pool, ok := e.shardPools[id]
if ok {
pool.Release()
delete(e.shardPools, id)
}
e.log.Info(logs.EngineShardHasBeenRemoved,
zap.String("id", id))
}
e.mtx.Unlock()
for _, sh := range ss {
err := sh.SetMode(mode.Disabled)
if err != nil {
e.log.Error(logs.EngineCouldNotChangeShardModeToDisabled,
zap.Stringer("id", sh.ID()),
zap.Error(err),
)
}
err = sh.Close()
if err != nil {
e.log.Error(logs.EngineCouldNotCloseRemovedShard,
zap.Stringer("id", sh.ID()),
zap.Error(err),
)
}
}
}
func generateShardID() (*shard.ID, error) {
uid, err := uuid.NewRandom()
if err != nil {
return nil, err
}
bin, err := uid.MarshalBinary()
if err != nil {
return nil, err
}
return shard.NewIDFromBytes(bin), nil
}
func (e *StorageEngine) sortShards(objAddr interface{ EncodeToString() string }) []hashedShard {
e.mtx.RLock()
defer e.mtx.RUnlock()
h := hrw.StringHash(objAddr.EncodeToString())
shards := make([]hashedShard, 0, len(e.shards))
for _, sh := range e.shards {
shards = append(shards, hashedShard(sh))
}
hrw.SortHasherSliceByValue(shards, h)
return shards
}
func (e *StorageEngine) unsortedShards() []hashedShard {
e.mtx.RLock()
defer e.mtx.RUnlock()
shards := make([]hashedShard, 0, len(e.shards))
for _, sh := range e.shards {
shards = append(shards, hashedShard(sh))
}
return shards
}
func (e *StorageEngine) iterateOverSortedShards(addr oid.Address, handler func(int, hashedShard) (stop bool)) {
for i, sh := range e.sortShards(addr) {
if handler(i, sh) {
break
}
}
}
func (e *StorageEngine) iterateOverUnsortedShards(handler func(hashedShard) (stop bool)) {
for _, sh := range e.unsortedShards() {
if handler(sh) {
break
}
}
}
// SetShardMode sets mode of the shard with provided identifier.
//
// Returns an error if shard mode was not set, or shard was not found in storage engine.
func (e *StorageEngine) SetShardMode(id *shard.ID, m mode.Mode, resetErrorCounter bool) error {
e.mtx.RLock()
defer e.mtx.RUnlock()
for shID, sh := range e.shards {
if id.String() == shID {
if resetErrorCounter {
sh.errorCount.Store(0)
e.metrics.ClearErrorCounter(shID)
}
return sh.SetMode(m)
}
}
return errShardNotFound
}
// HandleNewEpoch notifies every shard about NewEpoch event.
func (e *StorageEngine) HandleNewEpoch(ctx context.Context, epoch uint64) {
ev := shard.EventNewEpoch(epoch)
e.mtx.RLock()
defer e.mtx.RUnlock()
for _, sh := range e.shards {
select {
case <-ctx.Done():
return
case sh.NotificationChannel() <- ev:
default:
e.log.Debug(logs.ShardEventProcessingInProgress,
zap.Uint64("epoch", epoch), zap.Stringer("shard", sh.ID()))
}
}
}
func (e *StorageEngine) DetachShards(ids []*shard.ID) error {
if len(ids) == 0 {
return logicerr.New("ids must be non-empty")
}
deletedShards, err := e.deleteShards(ids)
if err != nil {
return err
}
return e.closeShards(deletedShards)
}
// closeShards closes deleted shards. Tries to close all shards.
// Returns single error with joined shard errors.
func (e *StorageEngine) closeShards(deletedShards []hashedShard) error {
var multiErr error
var multiErrGuard sync.Mutex
var eg errgroup.Group
for _, sh := range deletedShards {
eg.Go(func() error {
err := sh.SetMode(mode.Disabled)
if err != nil {
e.log.Error(logs.EngineCouldNotChangeShardModeToDisabled,
zap.Stringer("id", sh.ID()),
zap.Error(err),
)
multiErrGuard.Lock()
multiErr = errors.Join(multiErr, fmt.Errorf("could not change shard (id:%s) mode to disabled: %w", sh.ID(), err))
multiErrGuard.Unlock()
}
err = sh.Close()
if err != nil {
e.log.Error(logs.EngineCouldNotCloseRemovedShard,
zap.Stringer("id", sh.ID()),
zap.Error(err),
)
multiErrGuard.Lock()
multiErr = errors.Join(multiErr, fmt.Errorf("could not close removed shard (id:%s): %w", sh.ID(), err))
multiErrGuard.Unlock()
}
return nil
})
}
if err := eg.Wait(); err != nil {
return err
}
return multiErr
}
// deleteShards deletes shards with specified ids from engine shard list
// and releases all engine resources associated with shards.
// Returns deleted shards or error if some shard could not be deleted.
func (e *StorageEngine) deleteShards(ids []*shard.ID) ([]hashedShard, error) {
ss := make([]hashedShard, 0, len(ids))
e.mtx.Lock()
defer e.mtx.Unlock()
for _, id := range ids {
idStr := id.String()
sh, found := e.shards[idStr]
if !found {
return nil, errShardNotFound
}
ss = append(ss, sh)
}
if len(ss) == len(e.shards) {
return nil, logicerr.New("could not delete all the shards")
}
for _, sh := range ss {
idStr := sh.ID().String()
e.metrics.DeleteShardMetrics(idStr)
delete(e.shards, idStr)
pool, ok := e.shardPools[idStr]
if ok {
pool.Release()
delete(e.shardPools, idStr)
}
e.log.Info(logs.EngineShardHasBeenRemoved,
zap.String("id", idStr))
}
return ss, nil
}
func (s hashedShard) Hash() uint64 {
return s.hash
}