[#1818] writecache: Increase error counter on background errors

Signed-off-by: Evgenii Stratonikov <evgeniy@morphbits.ru>
This commit is contained in:
Evgenii Stratonikov 2022-10-20 13:40:25 +03:00 committed by fyrchik
parent a4a6d547a8
commit d2cce62934
7 changed files with 105 additions and 48 deletions

View file

@ -13,6 +13,7 @@ Changelog for NeoFS Node
- Do not panic with bad inputs for `GET_RANGE` (#2007) - Do not panic with bad inputs for `GET_RANGE` (#2007)
- Correctly select the shard for applying tree service operations (#1996) - Correctly select the shard for applying tree service operations (#1996)
- Physical child object removal by GC (#1699) - Physical child object removal by GC (#1699)
- Increase error counter for write-cache flush errors (#1818)
### Removed ### Removed
### Updated ### Updated

View file

@ -35,6 +35,25 @@ type shardWrapper struct {
*shard.Shard *shard.Shard
} }
// reportShardErrorBackground increases shard error counter and logs an error.
// It is intended to be used from background workers and
// doesn't change shard mode because of possible deadlocks.
func (e *StorageEngine) reportShardErrorBackground(id string, msg string, err error) {
e.mtx.RLock()
sh, ok := e.shards[id]
e.mtx.RUnlock()
if !ok {
return
}
errCount := sh.errorCount.Inc()
e.log.Warn(msg,
zap.String("shard_id", id),
zap.Uint32("error count", errCount),
zap.String("error", err.Error()))
}
// reportShardError checks that the amount of errors doesn't exceed the configured threshold. // reportShardError checks that the amount of errors doesn't exceed the configured threshold.
// If it does, shard is set to read-only mode. // If it does, shard is set to read-only mode.
func (e *StorageEngine) reportShardError( func (e *StorageEngine) reportShardError(

View file

@ -87,6 +87,7 @@ func (e *StorageEngine) createShard(opts []shard.Option) (*shard.Shard, error) {
shard.WithExpiredTombstonesCallback(e.processExpiredTombstones), shard.WithExpiredTombstonesCallback(e.processExpiredTombstones),
shard.WithExpiredLocksCallback(e.processExpiredLocks), shard.WithExpiredLocksCallback(e.processExpiredLocks),
shard.WithDeletedLockCallback(e.processDeletedLocks), shard.WithDeletedLockCallback(e.processDeletedLocks),
shard.WithReportErrorFunc(e.reportShardErrorBackground),
)...) )...)
if err := sh.UpdateID(); err != nil { if err := sh.UpdateID(); err != nil {

View file

@ -96,6 +96,8 @@ type cfg struct {
tsSource TombstoneSource tsSource TombstoneSource
metricsWriter MetricsWriter metricsWriter MetricsWriter
reportErrorFunc func(selfID string, message string, err error)
} }
func defaultCfg() *cfg { func defaultCfg() *cfg {
@ -103,6 +105,7 @@ func defaultCfg() *cfg {
rmBatchSize: 100, rmBatchSize: 100,
log: &logger.Logger{Logger: zap.L()}, log: &logger.Logger{Logger: zap.L()},
gcCfg: defaultGCCfg(), gcCfg: defaultGCCfg(),
reportErrorFunc: func(string, string, error) {},
} }
} }
@ -117,22 +120,23 @@ func New(opts ...Option) *Shard {
bs := blobstor.New(c.blobOpts...) bs := blobstor.New(c.blobOpts...)
mb := meta.New(c.metaOpts...) mb := meta.New(c.metaOpts...)
var writeCache writecache.Cache
if c.useWriteCache {
writeCache = writecache.New(
append(c.writeCacheOpts,
writecache.WithBlobstor(bs),
writecache.WithMetabase(mb))...)
}
s := &Shard{ s := &Shard{
cfg: c, cfg: c,
blobStor: bs, blobStor: bs,
metaBase: mb, metaBase: mb,
writeCache: writeCache,
tsSource: c.tsSource, tsSource: c.tsSource,
} }
if c.useWriteCache {
s.writeCache = writecache.New(
append(c.writeCacheOpts,
writecache.WithReportErrorFunc(func(msg string, err error) {
s.reportErrorFunc(s.ID().String(), msg, err)
}),
writecache.WithBlobstor(bs),
writecache.WithMetabase(mb))...)
}
if s.piloramaOpts != nil { if s.piloramaOpts != nil {
s.pilorama = pilorama.NewBoltForest(c.piloramaOpts...) s.pilorama = pilorama.NewBoltForest(c.piloramaOpts...)
} }
@ -281,6 +285,14 @@ func WithMetricsWriter(v MetricsWriter) Option {
} }
} }
// WithReportErrorFunc returns option to specify callback for handling storage-related errors
// in the background workers.
func WithReportErrorFunc(f func(selfID string, message string, err error)) Option {
return func(c *cfg) {
c.reportErrorFunc = f
}
}
func (s *Shard) fillInfo() { func (s *Shard) fillInfo() {
s.cfg.info.MetaBaseInfo = s.metaBase.DumpInfo() s.cfg.info.MetaBaseInfo = s.metaBase.DumpInfo()
s.cfg.info.BlobStorInfo = s.blobStor.DumpInfo() s.cfg.info.BlobStorInfo = s.blobStor.DumpInfo()

View file

@ -7,6 +7,7 @@ import (
"github.com/mr-tron/base58" "github.com/mr-tron/base58"
"github.com/nspcc-dev/neo-go/pkg/util/slice" "github.com/nspcc-dev/neo-go/pkg/util/slice"
objectCore "github.com/nspcc-dev/neofs-node/pkg/core/object" objectCore "github.com/nspcc-dev/neofs-node/pkg/core/object"
"github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor"
"github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor/common" "github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor/common"
meta "github.com/nspcc-dev/neofs-node/pkg/local_object_storage/metabase" meta "github.com/nspcc-dev/neofs-node/pkg/local_object_storage/metabase"
"github.com/nspcc-dev/neofs-sdk-go/object" "github.com/nspcc-dev/neofs-sdk-go/object"
@ -146,6 +147,16 @@ func (c *cache) flushBigObjects() {
} }
} }
func (c *cache) reportFlushError(msg string, addr string, err error) {
if c.reportError != nil {
c.reportError(msg, err)
} else {
c.log.Error(msg,
zap.String("address", addr),
zap.Error(err))
}
}
func (c *cache) flushFSTree(ignoreErrors bool) error { func (c *cache) flushFSTree(ignoreErrors bool) error {
var prm common.IteratePrm var prm common.IteratePrm
prm.IgnoreErrors = ignoreErrors prm.IgnoreErrors = ignoreErrors
@ -158,8 +169,8 @@ func (c *cache) flushFSTree(ignoreErrors bool) error {
data, err := f() data, err := f()
if err != nil { if err != nil {
c.reportFlushError("can't read a file", sAddr, err)
if ignoreErrors { if ignoreErrors {
c.log.Error("can't read a file", zap.Stringer("address", addr))
return nil return nil
} }
return err return err
@ -168,37 +179,21 @@ func (c *cache) flushFSTree(ignoreErrors bool) error {
var obj object.Object var obj object.Object
err = obj.Unmarshal(data) err = obj.Unmarshal(data)
if err != nil { if err != nil {
c.reportFlushError("can't unmarshal an object", sAddr, err)
if ignoreErrors { if ignoreErrors {
c.log.Error("can't unmarshal an object", zap.Stringer("address", addr))
return nil return nil
} }
return err return err
} }
var prm common.PutPrm err = c.flushObject(&obj, data)
prm.Address = addr
prm.Object = &obj
prm.RawData = data
res, err := c.blobstor.Put(prm)
if err != nil { if err != nil {
if ignoreErrors { if ignoreErrors {
c.log.Error("cant flush object to blobstor", zap.Error(err))
return nil return nil
} }
return err return err
} }
var updPrm meta.UpdateStorageIDPrm
updPrm.SetAddress(addr)
updPrm.SetStorageID(res.StorageID)
_, err = c.metabase.UpdateStorageID(updPrm)
if err != nil {
c.log.Error("failed to update storage ID in metabase", zap.Error(err))
return nil
}
// mark object as flushed // mark object as flushed
c.flushed.Add(sAddr, false) c.flushed.Add(sAddr, false)
@ -222,30 +217,40 @@ func (c *cache) flushWorker(_ int) {
return return
} }
err := c.flushObject(obj) err := c.flushObject(obj, nil)
if err != nil { if err == nil {
c.log.Error("can't flush object to the main storage", zap.Error(err))
} else {
c.flushed.Add(objectCore.AddressOf(obj).EncodeToString(), true) c.flushed.Add(objectCore.AddressOf(obj).EncodeToString(), true)
} }
} }
} }
// flushObject is used to write object directly to the main storage. // flushObject is used to write object directly to the main storage.
func (c *cache) flushObject(obj *object.Object) error { func (c *cache) flushObject(obj *object.Object, data []byte) error {
addr := objectCore.AddressOf(obj)
var prm common.PutPrm var prm common.PutPrm
prm.Object = obj prm.Object = obj
prm.RawData = data
res, err := c.blobstor.Put(prm) res, err := c.blobstor.Put(prm)
if err != nil { if err != nil {
if !errors.Is(err, common.ErrNoSpace) && !errors.Is(err, common.ErrReadOnly) &&
!errors.Is(err, blobstor.ErrNoPlaceFound) {
c.reportFlushError("can't flush an object to blobstor",
addr.EncodeToString(), err)
}
return err return err
} }
var updPrm meta.UpdateStorageIDPrm var updPrm meta.UpdateStorageIDPrm
updPrm.SetAddress(objectCore.AddressOf(obj)) updPrm.SetAddress(addr)
updPrm.SetStorageID(res.StorageID) updPrm.SetStorageID(res.StorageID)
_, err = c.metabase.UpdateStorageID(updPrm) _, err = c.metabase.UpdateStorageID(updPrm)
if err != nil {
c.reportFlushError("can't update object storage ID",
addr.EncodeToString(), err)
}
return err return err
} }
@ -280,6 +285,7 @@ func (c *cache) flush(ignoreErrors bool) error {
} }
if err := addr.DecodeString(sa); err != nil { if err := addr.DecodeString(sa); err != nil {
c.reportFlushError("can't decode object address from the DB", sa, err)
if ignoreErrors { if ignoreErrors {
continue continue
} }
@ -288,13 +294,14 @@ func (c *cache) flush(ignoreErrors bool) error {
var obj object.Object var obj object.Object
if err := obj.Unmarshal(data); err != nil { if err := obj.Unmarshal(data); err != nil {
c.reportFlushError("can't unmarshal an object from the DB", sa, err)
if ignoreErrors { if ignoreErrors {
continue continue
} }
return err return err
} }
if err := c.flushObject(&obj); err != nil { if err := c.flushObject(&obj, data); err != nil {
return err return err
} }
} }

View file

@ -21,6 +21,7 @@ import (
versionSDK "github.com/nspcc-dev/neofs-sdk-go/version" versionSDK "github.com/nspcc-dev/neofs-sdk-go/version"
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
"go.etcd.io/bbolt" "go.etcd.io/bbolt"
"go.uber.org/atomic"
"go.uber.org/zap/zaptest" "go.uber.org/zap/zaptest"
) )
@ -35,7 +36,7 @@ func TestFlush(t *testing.T) {
obj *object.Object obj *object.Object
} }
newCache := func(t *testing.T) (Cache, *blobstor.BlobStor, *meta.DB) { newCache := func(t *testing.T, opts ...Option) (Cache, *blobstor.BlobStor, *meta.DB) {
dir := t.TempDir() dir := t.TempDir()
mb := meta.New( mb := meta.New(
meta.WithPath(filepath.Join(dir, "meta")), meta.WithPath(filepath.Join(dir, "meta")),
@ -54,11 +55,13 @@ func TestFlush(t *testing.T) {
require.NoError(t, bs.Init()) require.NoError(t, bs.Init())
wc := New( wc := New(
append([]Option{
WithLogger(&logger.Logger{Logger: zaptest.NewLogger(t)}), WithLogger(&logger.Logger{Logger: zaptest.NewLogger(t)}),
WithPath(filepath.Join(dir, "writecache")), WithPath(filepath.Join(dir, "writecache")),
WithSmallObjectSize(smallSize), WithSmallObjectSize(smallSize),
WithMetabase(mb), WithMetabase(mb),
WithBlobstor(bs)) WithBlobstor(bs),
}, opts...)...)
require.NoError(t, wc.Open(false)) require.NoError(t, wc.Open(false))
require.NoError(t, wc.Init()) require.NoError(t, wc.Init())
@ -164,7 +167,10 @@ func TestFlush(t *testing.T) {
t.Run("ignore errors", func(t *testing.T) { t.Run("ignore errors", func(t *testing.T) {
testIgnoreErrors := func(t *testing.T, f func(*cache)) { testIgnoreErrors := func(t *testing.T, f func(*cache)) {
wc, bs, mb := newCache(t) var errCount atomic.Uint32
wc, bs, mb := newCache(t, WithReportErrorFunc(func(message string, err error) {
errCount.Inc()
}))
objects := putObjects(t, wc) objects := putObjects(t, wc)
f(wc.(*cache)) f(wc.(*cache))
@ -172,7 +178,9 @@ func TestFlush(t *testing.T) {
require.NoError(t, bs.SetMode(mode.ReadWrite)) require.NoError(t, bs.SetMode(mode.ReadWrite))
require.NoError(t, mb.SetMode(mode.ReadWrite)) require.NoError(t, mb.SetMode(mode.ReadWrite))
require.Equal(t, uint32(0), errCount.Load())
require.Error(t, wc.Flush(false)) require.Error(t, wc.Flush(false))
require.True(t, errCount.Load() > 0)
require.NoError(t, wc.Flush(true)) require.NoError(t, wc.Flush(true))
check(t, mb, bs, objects) check(t, mb, bs, objects)

View file

@ -52,6 +52,8 @@ type options struct {
maxBatchDelay time.Duration maxBatchDelay time.Duration
// noSync is true iff FSTree allows unsynchronized writes. // noSync is true iff FSTree allows unsynchronized writes.
noSync bool noSync bool
// reportError is the function called when encountering disk errors in background workers.
reportError func(string, error)
} }
// WithLogger sets logger. // WithLogger sets logger.
@ -142,3 +144,10 @@ func WithNoSync(noSync bool) Option {
o.noSync = noSync o.noSync = noSync
} }
} }
// WithReportErrorFunc sets error reporting function.
func WithReportErrorFunc(f func(string, error)) Option {
return func(o *options) {
o.reportError = f
}
}