[#1818] writecache: Increase error counter on background errors

Signed-off-by: Evgenii Stratonikov <evgeniy@morphbits.ru>
This commit is contained in:
Evgenii Stratonikov 2022-10-20 13:40:25 +03:00 committed by fyrchik
parent a4a6d547a8
commit d2cce62934
7 changed files with 105 additions and 48 deletions

View file

@ -13,6 +13,7 @@ Changelog for NeoFS Node
- Do not panic with bad inputs for `GET_RANGE` (#2007)
- Correctly select the shard for applying tree service operations (#1996)
- Physical child object removal by GC (#1699)
- Increase error counter for write-cache flush errors (#1818)
### Removed
### Updated

View file

@ -35,6 +35,25 @@ type shardWrapper struct {
*shard.Shard
}
// reportShardErrorBackground bumps the error counter of the shard with the
// given ID and writes the error to the log at warning level.
//
// It exists for background workers: it never switches the shard mode,
// because doing so from such a worker could deadlock.
// If no shard with this ID is registered, the call is a no-op.
func (e *StorageEngine) reportShardErrorBackground(id string, msg string, err error) {
	// Hold the read lock only for the map lookup, not while logging.
	e.mtx.RLock()
	wrapper, found := e.shards[id]
	e.mtx.RUnlock()

	if found {
		e.log.Warn(msg,
			zap.String("shard_id", id),
			zap.Uint32("error count", wrapper.errorCount.Inc()),
			zap.String("error", err.Error()))
	}
}
// reportShardError checks that the amount of errors doesn't exceed the configured threshold.
// If it does, shard is set to read-only mode.
func (e *StorageEngine) reportShardError(

View file

@ -87,6 +87,7 @@ func (e *StorageEngine) createShard(opts []shard.Option) (*shard.Shard, error) {
shard.WithExpiredTombstonesCallback(e.processExpiredTombstones),
shard.WithExpiredLocksCallback(e.processExpiredLocks),
shard.WithDeletedLockCallback(e.processDeletedLocks),
shard.WithReportErrorFunc(e.reportShardErrorBackground),
)...)
if err := sh.UpdateID(); err != nil {

View file

@ -96,13 +96,16 @@ type cfg struct {
tsSource TombstoneSource
metricsWriter MetricsWriter
reportErrorFunc func(selfID string, message string, err error)
}
// defaultCfg returns a cfg populated with default values: rmBatchSize of 100,
// a logger wrapping the global zap logger, the default GC configuration and
// a no-op reportErrorFunc.
//
// NOTE(review): rmBatchSize, log and gcCfg are listed twice below; this looks
// like the removed and the re-aligned added lines of the diff rendered
// together — confirm against the actual file.
func defaultCfg() *cfg {
return &cfg{
rmBatchSize: 100,
log: &logger.Logger{Logger: zap.L()},
gcCfg: defaultGCCfg(),
rmBatchSize: 100,
log: &logger.Logger{Logger: zap.L()},
gcCfg: defaultGCCfg(),
// No-op by default, so the callback can be invoked without a nil check.
reportErrorFunc: func(string, string, error) {},
}
}
@ -117,20 +120,21 @@ func New(opts ...Option) *Shard {
bs := blobstor.New(c.blobOpts...)
mb := meta.New(c.metaOpts...)
var writeCache writecache.Cache
if c.useWriteCache {
writeCache = writecache.New(
append(c.writeCacheOpts,
writecache.WithBlobstor(bs),
writecache.WithMetabase(mb))...)
s := &Shard{
cfg: c,
blobStor: bs,
metaBase: mb,
tsSource: c.tsSource,
}
s := &Shard{
cfg: c,
blobStor: bs,
metaBase: mb,
writeCache: writeCache,
tsSource: c.tsSource,
if c.useWriteCache {
s.writeCache = writecache.New(
append(c.writeCacheOpts,
writecache.WithReportErrorFunc(func(msg string, err error) {
s.reportErrorFunc(s.ID().String(), msg, err)
}),
writecache.WithBlobstor(bs),
writecache.WithMetabase(mb))...)
}
if s.piloramaOpts != nil {
@ -281,6 +285,14 @@ func WithMetricsWriter(v MetricsWriter) Option {
}
}
// WithReportErrorFunc returns an option that sets the callback used to report
// storage-related errors from the background workers.
//
// The callback receives the shard ID, a short message and the error itself.
// A nil f is ignored and the previously configured callback (a no-op by
// default) stays in place: the callback is invoked without a nil check, and
// calling a nil function value panics.
func WithReportErrorFunc(f func(selfID string, message string, err error)) Option {
	return func(c *cfg) {
		if f == nil {
			return
		}
		c.reportErrorFunc = f
	}
}
func (s *Shard) fillInfo() {
s.cfg.info.MetaBaseInfo = s.metaBase.DumpInfo()
s.cfg.info.BlobStorInfo = s.blobStor.DumpInfo()

View file

@ -7,6 +7,7 @@ import (
"github.com/mr-tron/base58"
"github.com/nspcc-dev/neo-go/pkg/util/slice"
objectCore "github.com/nspcc-dev/neofs-node/pkg/core/object"
"github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor"
"github.com/nspcc-dev/neofs-node/pkg/local_object_storage/blobstor/common"
meta "github.com/nspcc-dev/neofs-node/pkg/local_object_storage/metabase"
"github.com/nspcc-dev/neofs-sdk-go/object"
@ -146,6 +147,16 @@ func (c *cache) flushBigObjects() {
}
}
// reportFlushError hands a flush error to the configured reportError callback.
// When no callback is set, the error is logged instead, together with the
// object address. Note that addr is only used on the logging path; the
// callback receives just msg and err.
func (c *cache) reportFlushError(msg string, addr string, err error) {
	if report := c.reportError; report != nil {
		report(msg, err)
		return
	}

	c.log.Error(msg, zap.String("address", addr), zap.Error(err))
}
func (c *cache) flushFSTree(ignoreErrors bool) error {
var prm common.IteratePrm
prm.IgnoreErrors = ignoreErrors
@ -158,8 +169,8 @@ func (c *cache) flushFSTree(ignoreErrors bool) error {
data, err := f()
if err != nil {
c.reportFlushError("can't read a file", sAddr, err)
if ignoreErrors {
c.log.Error("can't read a file", zap.Stringer("address", addr))
return nil
}
return err
@ -168,37 +179,21 @@ func (c *cache) flushFSTree(ignoreErrors bool) error {
var obj object.Object
err = obj.Unmarshal(data)
if err != nil {
c.reportFlushError("can't unmarshal an object", sAddr, err)
if ignoreErrors {
c.log.Error("can't unmarshal an object", zap.Stringer("address", addr))
return nil
}
return err
}
var prm common.PutPrm
prm.Address = addr
prm.Object = &obj
prm.RawData = data
res, err := c.blobstor.Put(prm)
err = c.flushObject(&obj, data)
if err != nil {
if ignoreErrors {
c.log.Error("cant flush object to blobstor", zap.Error(err))
return nil
}
return err
}
var updPrm meta.UpdateStorageIDPrm
updPrm.SetAddress(addr)
updPrm.SetStorageID(res.StorageID)
_, err = c.metabase.UpdateStorageID(updPrm)
if err != nil {
c.log.Error("failed to update storage ID in metabase", zap.Error(err))
return nil
}
// mark object as flushed
c.flushed.Add(sAddr, false)
@ -222,30 +217,40 @@ func (c *cache) flushWorker(_ int) {
return
}
err := c.flushObject(obj)
if err != nil {
c.log.Error("can't flush object to the main storage", zap.Error(err))
} else {
err := c.flushObject(obj, nil)
if err == nil {
c.flushed.Add(objectCore.AddressOf(obj).EncodeToString(), true)
}
}
}
// flushObject is used to write object directly to the main storage.
//
// It puts obj into the blobstor (data is passed along as the raw payload,
// callers may pass nil) and then stores the resulting storage ID in the
// metabase. The error of the failed step is returned; nil means both steps
// succeeded.
//
// NOTE(review): both the old and the new signature line, and both SetAddress
// calls, appear below — this looks like removed and added diff lines rendered
// together; confirm against the actual file.
func (c *cache) flushObject(obj *object.Object) error {
func (c *cache) flushObject(obj *object.Object, data []byte) error {
addr := objectCore.AddressOf(obj)
var prm common.PutPrm
prm.Object = obj
prm.RawData = data
res, err := c.blobstor.Put(prm)
if err != nil {
// ErrNoSpace, ErrReadOnly and ErrNoPlaceFound are returned to the caller but
// not reported — presumably they describe storage state rather than a
// disk fault; confirm.
if !errors.Is(err, common.ErrNoSpace) && !errors.Is(err, common.ErrReadOnly) &&
!errors.Is(err, blobstor.ErrNoPlaceFound) {
c.reportFlushError("can't flush an object to blobstor",
addr.EncodeToString(), err)
}
return err
}
var updPrm meta.UpdateStorageIDPrm
updPrm.SetAddress(objectCore.AddressOf(obj))
updPrm.SetAddress(addr)
updPrm.SetStorageID(res.StorageID)
_, err = c.metabase.UpdateStorageID(updPrm)
if err != nil {
// A metabase update failure is always reported, with no filtering.
c.reportFlushError("can't update object storage ID",
addr.EncodeToString(), err)
}
return err
}
@ -280,6 +285,7 @@ func (c *cache) flush(ignoreErrors bool) error {
}
if err := addr.DecodeString(sa); err != nil {
c.reportFlushError("can't decode object address from the DB", sa, err)
if ignoreErrors {
continue
}
@ -288,13 +294,14 @@ func (c *cache) flush(ignoreErrors bool) error {
var obj object.Object
if err := obj.Unmarshal(data); err != nil {
c.reportFlushError("can't unmarshal an object from the DB", sa, err)
if ignoreErrors {
continue
}
return err
}
if err := c.flushObject(&obj); err != nil {
if err := c.flushObject(&obj, data); err != nil {
return err
}
}

View file

@ -21,6 +21,7 @@ import (
versionSDK "github.com/nspcc-dev/neofs-sdk-go/version"
"github.com/stretchr/testify/require"
"go.etcd.io/bbolt"
"go.uber.org/atomic"
"go.uber.org/zap/zaptest"
)
@ -35,7 +36,7 @@ func TestFlush(t *testing.T) {
obj *object.Object
}
newCache := func(t *testing.T) (Cache, *blobstor.BlobStor, *meta.DB) {
newCache := func(t *testing.T, opts ...Option) (Cache, *blobstor.BlobStor, *meta.DB) {
dir := t.TempDir()
mb := meta.New(
meta.WithPath(filepath.Join(dir, "meta")),
@ -54,11 +55,13 @@ func TestFlush(t *testing.T) {
require.NoError(t, bs.Init())
wc := New(
WithLogger(&logger.Logger{Logger: zaptest.NewLogger(t)}),
WithPath(filepath.Join(dir, "writecache")),
WithSmallObjectSize(smallSize),
WithMetabase(mb),
WithBlobstor(bs))
append([]Option{
WithLogger(&logger.Logger{Logger: zaptest.NewLogger(t)}),
WithPath(filepath.Join(dir, "writecache")),
WithSmallObjectSize(smallSize),
WithMetabase(mb),
WithBlobstor(bs),
}, opts...)...)
require.NoError(t, wc.Open(false))
require.NoError(t, wc.Init())
@ -164,7 +167,10 @@ func TestFlush(t *testing.T) {
t.Run("ignore errors", func(t *testing.T) {
testIgnoreErrors := func(t *testing.T, f func(*cache)) {
wc, bs, mb := newCache(t)
var errCount atomic.Uint32
wc, bs, mb := newCache(t, WithReportErrorFunc(func(message string, err error) {
errCount.Inc()
}))
objects := putObjects(t, wc)
f(wc.(*cache))
@ -172,7 +178,9 @@ func TestFlush(t *testing.T) {
require.NoError(t, bs.SetMode(mode.ReadWrite))
require.NoError(t, mb.SetMode(mode.ReadWrite))
require.Equal(t, uint32(0), errCount.Load())
require.Error(t, wc.Flush(false))
require.True(t, errCount.Load() > 0)
require.NoError(t, wc.Flush(true))
check(t, mb, bs, objects)

View file

@ -52,6 +52,8 @@ type options struct {
maxBatchDelay time.Duration
// noSync is true iff FSTree allows unsynchronized writes.
noSync bool
// reportError is the function called when encountering disk errors in background workers.
reportError func(string, error)
}
// WithLogger sets logger.
@ -142,3 +144,10 @@ func WithNoSync(noSync bool) Option {
o.noSync = noSync
}
}
// WithReportErrorFunc returns an option that stores f as the reportError
// callback of the write-cache options. The callback takes a message and
// the error being reported.
func WithReportErrorFunc(f func(string, error)) Option {
	setReporter := func(o *options) { o.reportError = f }
	return setReporter
}