node: Add metrics for the error counter in the engine #418

Merged
realloc merged 1 commit from acid-ant/frostfs-node:feature/372-add-err-counter-shard into master 2023-06-07 13:04:48 +00:00
6 changed files with 71 additions and 0 deletions

View file

@ -132,6 +132,7 @@ func (e *StorageEngine) reportShardErrorBackground(id string, msg string, err er
}
errCount := sh.errorCount.Add(1)
sh.Shard.IncErrorCounter()
ale64bit marked this conversation as resolved Outdated

consider using Inc if it's always 1

consider using `Inc` if it's always 1

Thanks, renamed.

Thanks, renamed.

Sorry, what I meant was that if the increment is always one, we can simply remove the delta everywhere and use the Inc method, e.g.:


func (m engineMetrics) IncErrorCounter(shardID string) {
	m.errorCounter.value.With(prometheus.Labels{shardIDLabelKey: shardID}).Inc()
}

But up to you, whether we will need the delta later.

Sorry, what I meant was that if the increment is always one, we can simply remove the delta everywhere and use the `Inc` method, e.g.: ``` func (m engineMetrics) IncErrorCounter(shardID string) { m.errorCounter.value.With(prometheus.Labels{shardIDLabelKey: shardID}).Inc() } ``` But up to you, whether we will need the `delta` later.

Agree, renamed in all places.

Agree, renamed in all places.
e.reportShardErrorWithFlags(sh.Shard, errCount, false, msg, err)
}
@ -150,6 +151,7 @@ func (e *StorageEngine) reportShardError(
}
errCount := sh.errorCount.Add(1)
sh.Shard.IncErrorCounter()
e.reportShardErrorWithFlags(sh.Shard, errCount, true, msg, err, fields...)
}

View file

@ -26,6 +26,9 @@ type MetricRegister interface {
AddToContainerSize(cnrID string, size int64)
AddToPayloadCounter(shardID string, size int64)
IncErrorCounter(shardID string)
ClearErrorCounter(shardID string)
DeleteErrorCounter(shardID string)
WriteCache() metrics.WriteCacheMetrics
GC() metrics.GCMetrics

View file

@ -62,6 +62,18 @@ func (m *metricsWithID) AddToPayloadSize(size int64) {
m.mw.AddToPayloadCounter(m.id, size)
}
// IncErrorCounter forwards the call to the wrapped metric register, passing
// the ID stored in this wrapper as the shard ID.
func (m *metricsWithID) IncErrorCounter() {
	shardID := m.id
	m.mw.IncErrorCounter(shardID)
}
// ClearErrorCounter forwards the call to the wrapped metric register, passing
// the ID stored in this wrapper as the shard ID.
func (m *metricsWithID) ClearErrorCounter() {
	shardID := m.id
	m.mw.ClearErrorCounter(shardID)
}
// DeleteErrorCounter forwards the call to the wrapped metric register, passing
// the ID stored in this wrapper as the shard ID.
func (m *metricsWithID) DeleteErrorCounter() {
	shardID := m.id
	m.mw.DeleteErrorCounter(shardID)
}
// AddShard adds a new shard to the storage engine.
//
// Returns any error encountered that did not allow adding a shard.
@ -174,6 +186,8 @@ func (e *StorageEngine) removeShards(ids ...string) {
continue
}
sh.DeleteErrorCounter()
dstepanov-yadro marked this conversation as resolved Outdated

Scenario: There were 5 shards, and one of them stopped working due to errors. I removed that shard so the server would continue to work normally. Afterwards I decided to investigate, but there were no metrics left for this shard.

So I think we should not delete the metrics when a shard is deleted.

Scenario: There were 5 shards, and one of them stopped working due to errors. I removed that shard so the server would continue to work normally. Afterwards I decided to investigate, but there were no metrics left for this shard. So I think we should not delete the metrics when a shard is deleted.
ss = append(ss, sh)
delete(e.shards, id)
@ -281,6 +295,7 @@ func (e *StorageEngine) SetShardMode(id *shard.ID, m mode.Mode, resetErrorCounte
if id.String() == shID {
if resetErrorCounter {
sh.errorCount.Store(0)
sh.Shard.ClearErrorCounter()
}
return sh.SetMode(m)
}

View file

@ -23,6 +23,7 @@ type metricsStore struct {
cnrSize map[string]int64
pldSize int64
readOnly bool
errCounter int64
}
// SetShardID is a no-op: the shard ID argument is ignored.
func (m metricsStore) SetShardID(string) {
}
@ -68,6 +69,18 @@ func (m *metricsStore) AddToPayloadSize(size int64) {
m.pldSize += size
}
// IncErrorCounter adds one to the error counter kept in the store.
func (m *metricsStore) IncErrorCounter() {
	m.errCounter++
}
func (m *metricsStore) ClearErrorCounter() {
ale64bit marked this conversation as resolved Outdated

do we really need two functions that do the same with different names?

do we really need two functions that do the same with different names?

It is in tests, and it is necessary to implement two methods ClearErrorCounter and DeleteErrorCounter for type metricsStore somehow.

It is in tests, and it is necessary to implement two methods `ClearErrorCounter` and `DeleteErrorCounter` for type `metricsStore` somehow.
m.errCounter = 0
}
// DeleteErrorCounter resets the stored error counter to zero; for this store
// deleting and clearing have the same effect, so it reuses ClearErrorCounter.
func (m *metricsStore) DeleteErrorCounter() {
	m.ClearErrorCounter()
}
const physical = "phy"
const logical = "logic"

View file

@ -75,6 +75,12 @@ type MetricsWriter interface {
SetShardID(id string)
// SetReadonly must set shard readonly state.
SetReadonly(readonly bool)
// IncErrorCounter increments the error counter.
IncErrorCounter()
// ClearErrorCounter clears the error counter.
ClearErrorCounter()
// DeleteErrorCounter deletes the error counter.
DeleteErrorCounter()
}
type cfg struct {
@ -428,3 +434,21 @@ func (s *Shard) addToPayloadSize(size int64) {
s.cfg.metricsWriter.AddToPayloadSize(size)
}
}
// IncErrorCounter calls IncErrorCounter on the configured metrics writer.
// It does nothing when no metrics writer is set.
func (s *Shard) IncErrorCounter() {
	mw := s.cfg.metricsWriter
	if mw == nil {
		return
	}
	mw.IncErrorCounter()
}
// ClearErrorCounter calls ClearErrorCounter on the configured metrics writer.
// It does nothing when no metrics writer is set.
func (s *Shard) ClearErrorCounter() {
	mw := s.cfg.metricsWriter
	if mw == nil {
		return
	}
	mw.ClearErrorCounter()
}
// DeleteErrorCounter calls DeleteErrorCounter on the configured metrics
// writer. It does nothing when no metrics writer is set.
func (s *Shard) DeleteErrorCounter() {
	mw := s.cfg.metricsWriter
	if mw == nil {
		return
	}
	mw.DeleteErrorCounter()
}

View file

@ -24,6 +24,7 @@ type (
listObjectsDuration prometheus.Counter
containerSize *prometheus.GaugeVec
payloadSize *prometheus.GaugeVec
errorCounter *prometheus.GaugeVec
}
)
@ -44,6 +45,7 @@ func newEngineMetrics() engineMetrics {
listObjectsDuration: newEngineMethodDurationCounter("list_objects"),
containerSize: newEngineGaugeVector("container_size", "Accumulated size of all objects in a container", []string{containerIDLabelKey}),
payloadSize: newEngineGaugeVector("payload_size", "Accumulated size of all objects in a shard", []string{shardIDLabelKey}),
errorCounter: newEngineGaugeVector("error_counter", "Shard's error counter", []string{shardIDLabelKey}),
}
}
@ -123,3 +125,15 @@ func (m engineMetrics) AddToContainerSize(cnrID string, size int64) {
// AddToPayloadCounter adds size, converted to float64, to the payloadSize
// gauge labeled with the given shard ID.
func (m engineMetrics) AddToPayloadCounter(shardID string, size int64) {
	labels := prometheus.Labels{shardIDLabelKey: shardID}
	delta := float64(size)
	m.payloadSize.With(labels).Add(delta)
}
// IncErrorCounter increments by one the errorCounter gauge labeled with the
// given shard ID.
func (m engineMetrics) IncErrorCounter(shardID string) {
	labels := prometheus.Labels{shardIDLabelKey: shardID}
	m.errorCounter.With(labels).Inc()
}
// ClearErrorCounter sets the errorCounter gauge labeled with the given shard
// ID to zero; the labeled series itself is kept.
func (m engineMetrics) ClearErrorCounter(shardID string) {
	labels := prometheus.Labels{shardIDLabelKey: shardID}
	m.errorCounter.With(labels).Set(0)
}
// DeleteErrorCounter removes the errorCounter series labeled with the given
// shard ID from the vector. The result of Delete is intentionally ignored.
func (m engineMetrics) DeleteErrorCounter(shardID string) {
	labels := prometheus.Labels{shardIDLabelKey: shardID}
	m.errorCounter.Delete(labels)
}