forked from TrueCloudLab/frostfs-node
[#1118] engine: allow to set error threshold
There are certain errors which are not expected during usual node operation and which tell us that something is wrong with the shard. To prevent possible data corruption, move shard in read-only mode after amount of errors exceeded some threshold. By default no actions are performed. Signed-off-by: Evgenii Stratonikov <evgeniy@nspcc.ru>
This commit is contained in:
parent
4f3323f084
commit
6ad2624552
13 changed files with 243 additions and 113 deletions
|
@ -6,6 +6,7 @@ import (
|
|||
"github.com/nspcc-dev/neofs-node/pkg/local_object_storage/shard"
|
||||
"github.com/nspcc-dev/neofs-node/pkg/util"
|
||||
"github.com/nspcc-dev/neofs-node/pkg/util/logger"
|
||||
"go.uber.org/atomic"
|
||||
"go.uber.org/zap"
|
||||
)
|
||||
|
||||
|
@ -15,7 +16,7 @@ type StorageEngine struct {
|
|||
|
||||
mtx *sync.RWMutex
|
||||
|
||||
shards map[string]*shard.Shard
|
||||
shards map[string]shardWrapper
|
||||
|
||||
shardPools map[string]util.WorkerPool
|
||||
|
||||
|
@ -26,12 +27,49 @@ type StorageEngine struct {
|
|||
}
|
||||
}
|
||||
|
||||
type shardWrapper struct {
|
||||
errorCount *atomic.Uint32
|
||||
*shard.Shard
|
||||
}
|
||||
|
||||
// reportShardError checks that amount of errors doesn't exceed configured threshold.
|
||||
// If it does, shard is set to read-only mode.
|
||||
func (e *StorageEngine) reportShardError(
|
||||
sh hashedShard,
|
||||
msg string,
|
||||
err error,
|
||||
fields ...zap.Field) {
|
||||
errCount := sh.errorCount.Inc()
|
||||
e.log.Warn(msg, append([]zap.Field{
|
||||
zap.Stringer("shard_id", sh.ID()),
|
||||
zap.Uint32("error count", errCount),
|
||||
zap.String("error", err.Error()),
|
||||
}, fields...)...)
|
||||
|
||||
if e.errorsThreshold == 0 || errCount < e.errorsThreshold {
|
||||
return
|
||||
}
|
||||
|
||||
err = sh.SetMode(shard.ModeReadOnly)
|
||||
if err != nil {
|
||||
e.log.Error("failed to move shard in read-only mode",
|
||||
zap.Uint32("error count", errCount),
|
||||
zap.Error(err))
|
||||
} else {
|
||||
e.log.Info("shard is moved in read-only due to error threshold",
|
||||
zap.Stringer("shard_id", sh.ID()),
|
||||
zap.Uint32("error count", errCount))
|
||||
}
|
||||
}
|
||||
|
||||
// Option represents StorageEngine's constructor option.
|
||||
type Option func(*cfg)
|
||||
|
||||
type cfg struct {
|
||||
log *logger.Logger
|
||||
|
||||
errorsThreshold uint32
|
||||
|
||||
metrics MetricRegister
|
||||
|
||||
shardPoolSize uint32
|
||||
|
@ -56,7 +94,7 @@ func New(opts ...Option) *StorageEngine {
|
|||
return &StorageEngine{
|
||||
cfg: c,
|
||||
mtx: new(sync.RWMutex),
|
||||
shards: make(map[string]*shard.Shard),
|
||||
shards: make(map[string]shardWrapper),
|
||||
shardPools: make(map[string]util.WorkerPool),
|
||||
}
|
||||
}
|
||||
|
@ -80,3 +118,11 @@ func WithShardPoolSize(sz uint32) Option {
|
|||
c.shardPoolSize = sz
|
||||
}
|
||||
}
|
||||
|
||||
// WithErrorThreshold returns an option to specify size amount of errors after which
|
||||
// shard is moved to read-only mode.
|
||||
func WithErrorThreshold(sz uint32) Option {
|
||||
return func(c *cfg) {
|
||||
c.errorsThreshold = sz
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue