forked from TrueCloudLab/frostfs-node
[#1818] writecache: Increase error counter on background errors
Signed-off-by: Evgenii Stratonikov <evgeniy@morphbits.ru>
This commit is contained in:
parent
bffb0f894c
commit
777fd32d4f
7 changed files with 105 additions and 48 deletions
|
@ -13,6 +13,7 @@ Changelog for NeoFS Node
|
||||||
### Fixed
|
### Fixed
|
||||||
- Open FSTree in sync mode by default (#1992)
|
- Open FSTree in sync mode by default (#1992)
|
||||||
- `neofs-cli container nodes`'s output (#1991)
|
- `neofs-cli container nodes`'s output (#1991)
|
||||||
|
- Increase error counter for write-cache flush errors (#1818)
|
||||||
|
|
||||||
### Removed
|
### Removed
|
||||||
### Updated
|
### Updated
|
||||||
|
|
|
@ -35,6 +35,25 @@ type shardWrapper struct {
|
||||||
*shard.Shard
|
*shard.Shard
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// reportShardErrorBackground increases shard error counter and logs an error.
|
||||||
|
// It is intended to be used from background workers and
|
||||||
|
// doesn't change shard mode because of possible deadlocks.
|
||||||
|
func (e *StorageEngine) reportShardErrorBackground(id string, msg string, err error) {
|
||||||
|
e.mtx.RLock()
|
||||||
|
sh, ok := e.shards[id]
|
||||||
|
e.mtx.RUnlock()
|
||||||
|
|
||||||
|
if !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
errCount := sh.errorCount.Inc()
|
||||||
|
e.log.Warn(msg,
|
||||||
|
zap.String("shard_id", id),
|
||||||
|
zap.Uint32("error count", errCount),
|
||||||
|
zap.String("error", err.Error()))
|
||||||
|
}
|
||||||
|
|
||||||
// reportShardError checks that the amount of errors doesn't exceed the configured threshold.
|
// reportShardError checks that the amount of errors doesn't exceed the configured threshold.
|
||||||
// If it does, shard is set to read-only mode.
|
// If it does, shard is set to read-only mode.
|
||||||
func (e *StorageEngine) reportShardError(
|
func (e *StorageEngine) reportShardError(
|
||||||
|
|
|
@ -87,6 +87,7 @@ func (e *StorageEngine) createShard(opts []shard.Option) (*shard.Shard, error) {
|
||||||
shard.WithExpiredTombstonesCallback(e.processExpiredTombstones),
|
shard.WithExpiredTombstonesCallback(e.processExpiredTombstones),
|
||||||
shard.WithExpiredLocksCallback(e.processExpiredLocks),
|
shard.WithExpiredLocksCallback(e.processExpiredLocks),
|
||||||
shard.WithDeletedLockCallback(e.processDeletedLocks),
|
shard.WithDeletedLockCallback(e.processDeletedLocks),
|
||||||
|
shard.WithReportErrorFunc(e.reportShardErrorBackground),
|
||||||
)...)
|
)...)
|
||||||
|
|
||||||
if err := sh.UpdateID(); err != nil {
|
if err := sh.UpdateID(); err != nil {
|
||||||
|
|
|
@ -96,6 +96,8 @@ type cfg struct {
|
||||||
tsSource TombstoneSource
|
tsSource TombstoneSource
|
||||||
|
|
||||||
metricsWriter MetricsWriter
|
metricsWriter MetricsWriter
|
||||||
|
|
||||||
|
reportErrorFunc func(selfID string, message string, err error)
|
||||||
}
|
}
|
||||||
|
|
||||||
func defaultCfg() *cfg {
|
func defaultCfg() *cfg {
|
||||||
|
@ -103,6 +105,7 @@ func defaultCfg() *cfg {
|
||||||
rmBatchSize: 100,
|
rmBatchSize: 100,
|
||||||
log: &logger.Logger{Logger: zap.L()},
|
log: &logger.Logger{Logger: zap.L()},
|
||||||
gcCfg: defaultGCCfg(),
|
gcCfg: defaultGCCfg(),
|
||||||
|
reportErrorFunc: func(string, string, error) {},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -117,22 +120,23 @@ func New(opts ...Option) *Shard {
|
||||||
bs := blobstor.New(c.blobOpts...)
|
bs := blobstor.New(c.blobOpts...)
|
||||||
mb := meta.New(c.metaOpts...)
|
mb := meta.New(c.metaOpts...)
|
||||||
|
|
||||||
var writeCache writecache.Cache
|
|
||||||
if c.useWriteCache {
|
|
||||||
writeCache = writecache.New(
|
|
||||||
append(c.writeCacheOpts,
|
|
||||||
writecache.WithBlobstor(bs),
|
|
||||||
writecache.WithMetabase(mb))...)
|
|
||||||
}
|
|
||||||
|
|
||||||
s := &Shard{
|
s := &Shard{
|
||||||
cfg: c,
|
cfg: c,
|
||||||
blobStor: bs,
|
blobStor: bs,
|
||||||
metaBase: mb,
|
metaBase: mb,
|
||||||
writeCache: writeCache,
|
|
||||||
tsSource: c.tsSource,
|
tsSource: c.tsSource,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if c.useWriteCache {
|
||||||
|
s.writeCache = writecache.New(
|
||||||
|
append(c.writeCacheOpts,
|
||||||
|
writecache.WithReportErrorFunc(func(msg string, err error) {
|
||||||
|
s.reportErrorFunc(s.ID().String(), msg, err)
|
||||||
|
}),
|
||||||
|
writecache.WithBlobstor(bs),
|
||||||
|
writecache.WithMetabase(mb))...)
|
||||||
|
}
|
||||||
|
|
||||||
if s.piloramaOpts != nil {
|
if s.piloramaOpts != nil {
|
||||||
s.pilorama = pilorama.NewBoltForest(c.piloramaOpts...)
|
s.pilorama = pilorama.NewBoltForest(c.piloramaOpts...)
|
||||||
}
|
}
|
||||||
|
@ -281,6 +285,14 @@ func WithMetricsWriter(v MetricsWriter) Option {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// WithReportErrorFunc returns option to specify callback for handling storage-related errors
|
||||||
|
// in the background workers.
|
||||||
|
func WithReportErrorFunc(f func(selfID string, message string, err error)) Option {
|
||||||
|
return func(c *cfg) {
|
||||||
|
c.reportErrorFunc = f
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func (s *Shard) fillInfo() {
|
func (s *Shard) fillInfo() {
|
||||||
s.cfg.info.MetaBaseInfo = s.metaBase.DumpInfo()
|
s.cfg.info.MetaBaseInfo = s.metaBase.DumpInfo()
|
||||||
s.cfg.info.BlobStorInfo = s.blobStor.DumpInfo()
|
s.cfg.info.BlobStorInfo = s.blobStor.DumpInfo()
|
||||||
|
|
|
@ -7,6 +7,7 @@ import (
|
||||||
"github.com/mr-tron/base58"
|
"github.com/mr-tron/base58"
|
||||||
"github.com/nspcc-dev/neo-go/pkg/util/slice"
|
"github.com/nspcc-dev/neo-go/pkg/util/slice"
|
||||||
objectCore "github.com/nspcc-dev/neofs-node/pkg/core/object"
|
objectCore "github.com/nspcc-dev/neofs-node/pkg/core/object"
|
||||||
|
"github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor"
|
||||||
"github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor/common"
|
"github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor/common"
|
||||||
meta "github.com/nspcc-dev/neofs-node/pkg/local_object_storage/metabase"
|
meta "github.com/nspcc-dev/neofs-node/pkg/local_object_storage/metabase"
|
||||||
"github.com/nspcc-dev/neofs-sdk-go/object"
|
"github.com/nspcc-dev/neofs-sdk-go/object"
|
||||||
|
@ -146,6 +147,16 @@ func (c *cache) flushBigObjects() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (c *cache) reportFlushError(msg string, addr string, err error) {
|
||||||
|
if c.reportError != nil {
|
||||||
|
c.reportError(msg, err)
|
||||||
|
} else {
|
||||||
|
c.log.Error(msg,
|
||||||
|
zap.String("address", addr),
|
||||||
|
zap.Error(err))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func (c *cache) flushFSTree(ignoreErrors bool) error {
|
func (c *cache) flushFSTree(ignoreErrors bool) error {
|
||||||
var prm common.IteratePrm
|
var prm common.IteratePrm
|
||||||
prm.IgnoreErrors = ignoreErrors
|
prm.IgnoreErrors = ignoreErrors
|
||||||
|
@ -158,8 +169,8 @@ func (c *cache) flushFSTree(ignoreErrors bool) error {
|
||||||
|
|
||||||
data, err := f()
|
data, err := f()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
c.reportFlushError("can't read a file", sAddr, err)
|
||||||
if ignoreErrors {
|
if ignoreErrors {
|
||||||
c.log.Error("can't read a file", zap.Stringer("address", addr))
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
return err
|
return err
|
||||||
|
@ -168,37 +179,21 @@ func (c *cache) flushFSTree(ignoreErrors bool) error {
|
||||||
var obj object.Object
|
var obj object.Object
|
||||||
err = obj.Unmarshal(data)
|
err = obj.Unmarshal(data)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
c.reportFlushError("can't unmarshal an object", sAddr, err)
|
||||||
if ignoreErrors {
|
if ignoreErrors {
|
||||||
c.log.Error("can't unmarshal an object", zap.Stringer("address", addr))
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
var prm common.PutPrm
|
err = c.flushObject(&obj, data)
|
||||||
prm.Address = addr
|
|
||||||
prm.Object = &obj
|
|
||||||
prm.RawData = data
|
|
||||||
|
|
||||||
res, err := c.blobstor.Put(prm)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if ignoreErrors {
|
if ignoreErrors {
|
||||||
c.log.Error("cant flush object to blobstor", zap.Error(err))
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
var updPrm meta.UpdateStorageIDPrm
|
|
||||||
updPrm.SetAddress(addr)
|
|
||||||
updPrm.SetStorageID(res.StorageID)
|
|
||||||
|
|
||||||
_, err = c.metabase.UpdateStorageID(updPrm)
|
|
||||||
if err != nil {
|
|
||||||
c.log.Error("failed to update storage ID in metabase", zap.Error(err))
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// mark object as flushed
|
// mark object as flushed
|
||||||
c.flushed.Add(sAddr, false)
|
c.flushed.Add(sAddr, false)
|
||||||
|
|
||||||
|
@ -222,30 +217,40 @@ func (c *cache) flushWorker(_ int) {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
err := c.flushObject(obj)
|
err := c.flushObject(obj, nil)
|
||||||
if err != nil {
|
if err == nil {
|
||||||
c.log.Error("can't flush object to the main storage", zap.Error(err))
|
|
||||||
} else {
|
|
||||||
c.flushed.Add(objectCore.AddressOf(obj).EncodeToString(), true)
|
c.flushed.Add(objectCore.AddressOf(obj).EncodeToString(), true)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// flushObject is used to write object directly to the main storage.
|
// flushObject is used to write object directly to the main storage.
|
||||||
func (c *cache) flushObject(obj *object.Object) error {
|
func (c *cache) flushObject(obj *object.Object, data []byte) error {
|
||||||
|
addr := objectCore.AddressOf(obj)
|
||||||
|
|
||||||
var prm common.PutPrm
|
var prm common.PutPrm
|
||||||
prm.Object = obj
|
prm.Object = obj
|
||||||
|
prm.RawData = data
|
||||||
|
|
||||||
res, err := c.blobstor.Put(prm)
|
res, err := c.blobstor.Put(prm)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
if !errors.Is(err, common.ErrNoSpace) && !errors.Is(err, common.ErrReadOnly) &&
|
||||||
|
!errors.Is(err, blobstor.ErrNoPlaceFound) {
|
||||||
|
c.reportFlushError("can't flush an object to blobstor",
|
||||||
|
addr.EncodeToString(), err)
|
||||||
|
}
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
var updPrm meta.UpdateStorageIDPrm
|
var updPrm meta.UpdateStorageIDPrm
|
||||||
updPrm.SetAddress(objectCore.AddressOf(obj))
|
updPrm.SetAddress(addr)
|
||||||
updPrm.SetStorageID(res.StorageID)
|
updPrm.SetStorageID(res.StorageID)
|
||||||
|
|
||||||
_, err = c.metabase.UpdateStorageID(updPrm)
|
_, err = c.metabase.UpdateStorageID(updPrm)
|
||||||
|
if err != nil {
|
||||||
|
c.reportFlushError("can't update object storage ID",
|
||||||
|
addr.EncodeToString(), err)
|
||||||
|
}
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -280,6 +285,7 @@ func (c *cache) flush(ignoreErrors bool) error {
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := addr.DecodeString(sa); err != nil {
|
if err := addr.DecodeString(sa); err != nil {
|
||||||
|
c.reportFlushError("can't decode object address from the DB", sa, err)
|
||||||
if ignoreErrors {
|
if ignoreErrors {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
@ -288,13 +294,14 @@ func (c *cache) flush(ignoreErrors bool) error {
|
||||||
|
|
||||||
var obj object.Object
|
var obj object.Object
|
||||||
if err := obj.Unmarshal(data); err != nil {
|
if err := obj.Unmarshal(data); err != nil {
|
||||||
|
c.reportFlushError("can't unmarshal an object from the DB", sa, err)
|
||||||
if ignoreErrors {
|
if ignoreErrors {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := c.flushObject(&obj); err != nil {
|
if err := c.flushObject(&obj, data); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,6 +21,7 @@ import (
|
||||||
versionSDK "github.com/nspcc-dev/neofs-sdk-go/version"
|
versionSDK "github.com/nspcc-dev/neofs-sdk-go/version"
|
||||||
"github.com/stretchr/testify/require"
|
"github.com/stretchr/testify/require"
|
||||||
"go.etcd.io/bbolt"
|
"go.etcd.io/bbolt"
|
||||||
|
"go.uber.org/atomic"
|
||||||
"go.uber.org/zap/zaptest"
|
"go.uber.org/zap/zaptest"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -35,7 +36,7 @@ func TestFlush(t *testing.T) {
|
||||||
obj *object.Object
|
obj *object.Object
|
||||||
}
|
}
|
||||||
|
|
||||||
newCache := func(t *testing.T) (Cache, *blobstor.BlobStor, *meta.DB) {
|
newCache := func(t *testing.T, opts ...Option) (Cache, *blobstor.BlobStor, *meta.DB) {
|
||||||
dir := t.TempDir()
|
dir := t.TempDir()
|
||||||
mb := meta.New(
|
mb := meta.New(
|
||||||
meta.WithPath(filepath.Join(dir, "meta")),
|
meta.WithPath(filepath.Join(dir, "meta")),
|
||||||
|
@ -54,11 +55,13 @@ func TestFlush(t *testing.T) {
|
||||||
require.NoError(t, bs.Init())
|
require.NoError(t, bs.Init())
|
||||||
|
|
||||||
wc := New(
|
wc := New(
|
||||||
|
append([]Option{
|
||||||
WithLogger(&logger.Logger{Logger: zaptest.NewLogger(t)}),
|
WithLogger(&logger.Logger{Logger: zaptest.NewLogger(t)}),
|
||||||
WithPath(filepath.Join(dir, "writecache")),
|
WithPath(filepath.Join(dir, "writecache")),
|
||||||
WithSmallObjectSize(smallSize),
|
WithSmallObjectSize(smallSize),
|
||||||
WithMetabase(mb),
|
WithMetabase(mb),
|
||||||
WithBlobstor(bs))
|
WithBlobstor(bs),
|
||||||
|
}, opts...)...)
|
||||||
require.NoError(t, wc.Open(false))
|
require.NoError(t, wc.Open(false))
|
||||||
require.NoError(t, wc.Init())
|
require.NoError(t, wc.Init())
|
||||||
|
|
||||||
|
@ -164,7 +167,10 @@ func TestFlush(t *testing.T) {
|
||||||
|
|
||||||
t.Run("ignore errors", func(t *testing.T) {
|
t.Run("ignore errors", func(t *testing.T) {
|
||||||
testIgnoreErrors := func(t *testing.T, f func(*cache)) {
|
testIgnoreErrors := func(t *testing.T, f func(*cache)) {
|
||||||
wc, bs, mb := newCache(t)
|
var errCount atomic.Uint32
|
||||||
|
wc, bs, mb := newCache(t, WithReportErrorFunc(func(message string, err error) {
|
||||||
|
errCount.Inc()
|
||||||
|
}))
|
||||||
objects := putObjects(t, wc)
|
objects := putObjects(t, wc)
|
||||||
f(wc.(*cache))
|
f(wc.(*cache))
|
||||||
|
|
||||||
|
@ -172,7 +178,9 @@ func TestFlush(t *testing.T) {
|
||||||
require.NoError(t, bs.SetMode(mode.ReadWrite))
|
require.NoError(t, bs.SetMode(mode.ReadWrite))
|
||||||
require.NoError(t, mb.SetMode(mode.ReadWrite))
|
require.NoError(t, mb.SetMode(mode.ReadWrite))
|
||||||
|
|
||||||
|
require.Equal(t, uint32(0), errCount.Load())
|
||||||
require.Error(t, wc.Flush(false))
|
require.Error(t, wc.Flush(false))
|
||||||
|
require.True(t, errCount.Load() > 0)
|
||||||
require.NoError(t, wc.Flush(true))
|
require.NoError(t, wc.Flush(true))
|
||||||
|
|
||||||
check(t, mb, bs, objects)
|
check(t, mb, bs, objects)
|
||||||
|
|
|
@ -52,6 +52,8 @@ type options struct {
|
||||||
maxBatchDelay time.Duration
|
maxBatchDelay time.Duration
|
||||||
// noSync is true iff FSTree allows unsynchronized writes.
|
// noSync is true iff FSTree allows unsynchronized writes.
|
||||||
noSync bool
|
noSync bool
|
||||||
|
// reportError is the function called when encountering disk errors in background workers.
|
||||||
|
reportError func(string, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
// WithLogger sets logger.
|
// WithLogger sets logger.
|
||||||
|
@ -142,3 +144,10 @@ func WithNoSync(noSync bool) Option {
|
||||||
o.noSync = noSync
|
o.noSync = noSync
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// WithReportErrorFunc sets error reporting function.
|
||||||
|
func WithReportErrorFunc(f func(string, error)) Option {
|
||||||
|
return func(o *options) {
|
||||||
|
o.reportError = f
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in a new issue