node: Add metrics for the error counter in the engine #418
6 changed files with 71 additions and 0 deletions
|
@ -132,6 +132,7 @@ func (e *StorageEngine) reportShardErrorBackground(id string, msg string, err er
|
|||
}
|
||||
|
||||
errCount := sh.errorCount.Add(1)
|
||||
sh.Shard.IncErrorCounter()
|
||||
ale64bit marked this conversation as resolved
Outdated
|
||||
e.reportShardErrorWithFlags(sh.Shard, errCount, false, msg, err)
|
||||
}
|
||||
|
||||
|
@ -150,6 +151,7 @@ func (e *StorageEngine) reportShardError(
|
|||
}
|
||||
|
||||
errCount := sh.errorCount.Add(1)
|
||||
sh.Shard.IncErrorCounter()
|
||||
e.reportShardErrorWithFlags(sh.Shard, errCount, true, msg, err, fields...)
|
||||
}
|
||||
|
||||
|
|
|
@ -26,6 +26,9 @@ type MetricRegister interface {
|
|||
|
||||
AddToContainerSize(cnrID string, size int64)
|
||||
AddToPayloadCounter(shardID string, size int64)
|
||||
IncErrorCounter(shardID string)
|
||||
ClearErrorCounter(shardID string)
|
||||
DeleteErrorCounter(shardID string)
|
||||
|
||||
WriteCache() metrics.WriteCacheMetrics
|
||||
GC() metrics.GCMetrics
|
||||
|
|
|
@ -62,6 +62,18 @@ func (m *metricsWithID) AddToPayloadSize(size int64) {
|
|||
m.mw.AddToPayloadCounter(m.id, size)
|
||||
}
|
||||
|
||||
// IncErrorCounter forwards to the wrapped metric register's IncErrorCounter,
// passing m.id as the shard ID.
func (m *metricsWithID) IncErrorCounter() {
|
||||
m.mw.IncErrorCounter(m.id)
|
||||
}
|
||||
|
||||
// ClearErrorCounter forwards to the wrapped metric register's
// ClearErrorCounter, passing m.id as the shard ID.
func (m *metricsWithID) ClearErrorCounter() {
|
||||
m.mw.ClearErrorCounter(m.id)
|
||||
}
|
||||
|
||||
// DeleteErrorCounter forwards to the wrapped metric register's
// DeleteErrorCounter, passing m.id as the shard ID.
func (m *metricsWithID) DeleteErrorCounter() {
|
||||
m.mw.DeleteErrorCounter(m.id)
|
||||
}
|
||||
|
||||
// AddShard adds a new shard to the storage engine.
|
||||
//
|
||||
// Returns any error encountered that did not allow adding a shard.
|
||||
|
@ -174,6 +186,8 @@ func (e *StorageEngine) removeShards(ids ...string) {
|
|||
continue
|
||||
}
|
||||
|
||||
sh.DeleteErrorCounter()
|
||||
dstepanov-yadro marked this conversation as resolved
Outdated
dstepanov-yadro
commented
Scenario: There were 5 shards, and one of them stopped working due to errors. I deleted that shard so the server would continue to work normally. After that, I decided to investigate, but there are no metrics for this shard.
So I think we should not delete the metrics when a shard is deleted.
|
||||
|
||||
ss = append(ss, sh)
|
||||
delete(e.shards, id)
|
||||
|
||||
|
@ -281,6 +295,7 @@ func (e *StorageEngine) SetShardMode(id *shard.ID, m mode.Mode, resetErrorCounte
|
|||
if id.String() == shID {
|
||||
if resetErrorCounter {
|
||||
sh.errorCount.Store(0)
|
||||
sh.Shard.ClearErrorCounter()
|
||||
}
|
||||
return sh.SetMode(m)
|
||||
}
|
||||
|
|
|
@ -23,6 +23,7 @@ type metricsStore struct {
|
|||
cnrSize map[string]int64
|
||||
pldSize int64
|
||||
readOnly bool
|
||||
errCounter int64
|
||||
}
|
||||
|
||||
// SetShardID is a no-op: the shard ID argument is ignored.
// NOTE(review): this method uses a value receiver while the other metricsStore
// methods visible here use pointer receivers — confirm this is intentional.
func (m metricsStore) SetShardID(_ string) {}
|
||||
|
@ -68,6 +69,18 @@ func (m *metricsStore) AddToPayloadSize(size int64) {
|
|||
m.pldSize += size
|
||||
}
|
||||
|
||||
// IncErrorCounter increases the stored error counter by one.
func (m *metricsStore) IncErrorCounter() {
|
||||
m.errCounter++
|
||||
}
|
||||
|
||||
func (m *metricsStore) ClearErrorCounter() {
|
||||
ale64bit marked this conversation as resolved
Outdated
ale64bit
commented
Do we really need two functions that do the same thing under different names?
acid-ant
commented
This is in tests, and it is necessary to implement both methods, `ClearErrorCounter` and `DeleteErrorCounter`, for the type `metricsStore` somehow.
|
||||
m.errCounter = 0
|
||||
}
|
||||
|
||||
// DeleteErrorCounter resets the stored error counter to zero.
// In this store it has the same effect as ClearErrorCounter.
func (m *metricsStore) DeleteErrorCounter() {
|
||||
m.errCounter = 0
|
||||
}
|
||||
|
||||
const physical = "phy"
|
||||
const logical = "logic"
|
||||
|
||||
|
|
|
@ -75,6 +75,12 @@ type MetricsWriter interface {
|
|||
SetShardID(id string)
|
||||
// SetReadonly must set shard readonly state.
|
||||
SetReadonly(readonly bool)
|
||||
// IncErrorCounter increment error counter.
|
||||
IncErrorCounter()
|
||||
// ClearErrorCounter clear error counter.
|
||||
ClearErrorCounter()
|
||||
// DeleteErrorCounter delete error counter.
|
||||
DeleteErrorCounter()
|
||||
}
|
||||
|
||||
type cfg struct {
|
||||
|
@ -428,3 +434,21 @@ func (s *Shard) addToPayloadSize(size int64) {
|
|||
s.cfg.metricsWriter.AddToPayloadSize(size)
|
||||
}
|
||||
}
|
||||
|
||||
// IncErrorCounter calls IncErrorCounter on the shard's metrics writer.
// It does nothing when no metrics writer is configured.
func (s *Shard) IncErrorCounter() {
|
||||
if s.cfg.metricsWriter != nil {
|
||||
s.cfg.metricsWriter.IncErrorCounter()
|
||||
}
|
||||
}
|
||||
|
||||
// ClearErrorCounter calls ClearErrorCounter on the shard's metrics writer.
// It does nothing when no metrics writer is configured.
func (s *Shard) ClearErrorCounter() {
|
||||
if s.cfg.metricsWriter != nil {
|
||||
s.cfg.metricsWriter.ClearErrorCounter()
|
||||
}
|
||||
}
|
||||
|
||||
// DeleteErrorCounter calls DeleteErrorCounter on the shard's metrics writer.
// It does nothing when no metrics writer is configured.
func (s *Shard) DeleteErrorCounter() {
|
||||
if s.cfg.metricsWriter != nil {
|
||||
s.cfg.metricsWriter.DeleteErrorCounter()
|
||||
}
|
||||
}
|
||||
|
|
|
@ -24,6 +24,7 @@ type (
|
|||
listObjectsDuration prometheus.Counter
|
||||
containerSize *prometheus.GaugeVec
|
||||
payloadSize *prometheus.GaugeVec
|
||||
errorCounter *prometheus.GaugeVec
|
||||
}
|
||||
)
|
||||
|
||||
|
@ -44,6 +45,7 @@ func newEngineMetrics() engineMetrics {
|
|||
listObjectsDuration: newEngineMethodDurationCounter("list_objects"),
|
||||
containerSize: newEngineGaugeVector("container_size", "Accumulated size of all objects in a container", []string{containerIDLabelKey}),
|
||||
payloadSize: newEngineGaugeVector("payload_size", "Accumulated size of all objects in a shard", []string{shardIDLabelKey}),
|
||||
errorCounter: newEngineGaugeVector("error_counter", "Shard's error counter", []string{shardIDLabelKey}),
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -123,3 +125,15 @@ func (m engineMetrics) AddToContainerSize(cnrID string, size int64) {
|
|||
// AddToPayloadCounter adds size to the payloadSize gauge labelled with shardID.
func (m engineMetrics) AddToPayloadCounter(shardID string, size int64) {
|
||||
m.payloadSize.With(prometheus.Labels{shardIDLabelKey: shardID}).Add(float64(size))
|
||||
}
|
||||
|
||||
// IncErrorCounter increments the errorCounter gauge labelled with shardID.
func (m engineMetrics) IncErrorCounter(shardID string) {
|
||||
m.errorCounter.With(prometheus.Labels{shardIDLabelKey: shardID}).Inc()
|
||||
}
|
||||
|
||||
// ClearErrorCounter sets the errorCounter gauge labelled with shardID to zero.
func (m engineMetrics) ClearErrorCounter(shardID string) {
|
||||
m.errorCounter.With(prometheus.Labels{shardIDLabelKey: shardID}).Set(0)
|
||||
}
|
||||
|
||||
// DeleteErrorCounter deletes the errorCounter entry labelled with shardID from
// the gauge vector, unlike ClearErrorCounter, which sets the value to zero.
func (m engineMetrics) DeleteErrorCounter(shardID string) {
|
||||
m.errorCounter.Delete(prometheus.Labels{shardIDLabelKey: shardID})
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue
Consider using `Inc` if it's always 1.
Thanks, renamed.
Sorry, what I meant was that if the increment is always one, we can simply remove the delta everywhere and use the `Inc` method, e.g.:
But it's up to you, depending on whether we will need the `delta` later.
Agree, renamed in all places.