[#370] Add tree service metrics #388

Merged
fyrchik merged 1 commit from ale64bit/frostfs-node:feature/370-metrics-tree-service into master 2023-05-26 13:39:14 +00:00
9 changed files with 117 additions and 3 deletions

View File

@@ -10,6 +10,7 @@ Changelog for FrostFS Node
- Support copies number parameter in `frostfs-cli object put` (#351)
- Set extra wallets on SIGHUP for ir (#125)
- Writecache metrics (#312)
- Add tree service metrics (#370)
### Changed
- `frostfs-cli util locode generate` is now much faster (#309)

View File

@@ -55,7 +55,8 @@ func initTreeService(c *cfg) {
tree.WithContainerCacheSize(treeConfig.CacheSize()),
tree.WithReplicationTimeout(treeConfig.ReplicationTimeout()),
tree.WithReplicationChannelCapacity(treeConfig.ReplicationChannelCapacity()),
tree.WithReplicationWorkerCount(treeConfig.ReplicationWorkerCount()))
tree.WithReplicationWorkerCount(treeConfig.ReplicationWorkerCount()),
tree.WithMetrics(c.metricsCollector.TreeService()))
for _, srv := range c.cfgGRPC.servers {
tree.RegisterTreeServiceServer(srv, c.treeService)
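
For context, the new `WithMetrics` option rides on the existing functional-options wiring, so any caller can supply its own register. A minimal standalone sketch, not part of this diff (other required options and gRPC plumbing omitted; the `pkg/metrics` import path is assumed):

```go
package main

import (
	"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/metrics"
	"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/services/tree"
)

func main() {
	// NewNodeMetrics registers every collector, including the tree
	// service histograms added in this PR.
	m := metrics.NewNodeMetrics()

	// TreeService() exposes just the subset the tree service needs,
	// typed as tree.MetricsRegister.
	_ = tree.New(tree.WithMetrics(m.TreeService()))
}
```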

View File

@@ -1,6 +1,9 @@
package metrics
import "github.com/prometheus/client_golang/prometheus"
import (
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/services/tree"
"github.com/prometheus/client_golang/prometheus"
)
const namespace = "frostfs_node"
@@ -9,9 +12,10 @@ type NodeMetrics struct {
engineMetrics
stateMetrics
replicatorMetrics
epoch metric[prometheus.Gauge]

Why did we move this line?

I moved it to clearly split the embedded fields from the others, including this one.
writeCacheMetrics *writeCacheMetrics
treeService *treeServiceMetrics
epoch metric[prometheus.Gauge]
}
func NewNodeMetrics() *NodeMetrics {
@@ -27,6 +31,9 @@ func NewNodeMetrics() *NodeMetrics {
replicator := newReplicatorMetrics()
replicator.register()
treeService := newTreeServiceMetrics()
treeService.register()
epoch := newGauge(prometheus.GaugeOpts{
Namespace: namespace,
Subsystem: innerRingSubsystem,
@@ -43,6 +50,7 @@ func NewNodeMetrics() *NodeMetrics {
engineMetrics: engine,
stateMetrics: state,
replicatorMetrics: replicator,
treeService: treeService,
epoch: epoch,
writeCacheMetrics: writeCacheMetrics,
}
@@ -60,3 +68,7 @@ func (m *NodeMetrics) WriteCache() WriteCacheMetrics {
}
return m.writeCacheMetrics
}
func (m *NodeMetrics) TreeService() tree.MetricsRegister {
return m.treeService
}
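
Since `TreeService()` returns the concrete `*treeServiceMetrics` as a `tree.MetricsRegister`, a compile-time assertion could pin that contract; a hypothetical one-liner, not part of this change:

```go
package metrics

import "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/services/tree"

// Compile-time check that treeServiceMetrics keeps satisfying the
// tree service's metrics interface.
var _ tree.MetricsRegister = (*treeServiceMetrics)(nil)
```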

View File

@@ -0,0 +1,64 @@
package metrics
import (
"fmt"
"time"
"github.com/prometheus/client_golang/prometheus"
)
const treeServiceLabelSuccess = "success"
type treeServiceMetrics struct {
replicateTaskDuration metric[*prometheus.HistogramVec]
replicateWaitDuration metric[*prometheus.HistogramVec]
syncOpDuration metric[*prometheus.HistogramVec]
}
func newTreeServiceMetrics() *treeServiceMetrics {
const treeServiceSubsystem = "treeservice"
return &treeServiceMetrics{
replicateTaskDuration: newHistogramVec(prometheus.HistogramOpts{
Namespace: namespace,
Subsystem: treeServiceSubsystem,
Name: "replicate_task_duration_seconds",
Help: "Duration of individual replication tasks executed as part of replication loops",
}, []string{treeServiceLabelSuccess}),
replicateWaitDuration: newHistogramVec(prometheus.HistogramOpts{
Namespace: namespace,
Subsystem: treeServiceSubsystem,
Name: "replicate_wait_duration_seconds",
Help: "Duration of overall waiting time for replication loops",
}, []string{treeServiceLabelSuccess}),
syncOpDuration: newHistogramVec(prometheus.HistogramOpts{
Namespace: namespace,
Subsystem: treeServiceSubsystem,
Name: "sync_duration_seconds",
Help: "Duration of synchronization operations",
}, []string{treeServiceLabelSuccess}),
}
}
func (m *treeServiceMetrics) register() {
mustRegister(m.replicateTaskDuration)
mustRegister(m.replicateWaitDuration)
mustRegister(m.syncOpDuration)
}
func (m *treeServiceMetrics) AddReplicateTaskDuration(d time.Duration, success bool) {
m.replicateTaskDuration.value.With(prometheus.Labels{
dstepanov-yadro marked this conversation as resolved

@ale64bit @fyrchik Maybe it's not worth separating operations and errors, but using labels everywhere? In fact, there is an operation that ended with an error or without an error.

I didn't fully understand what you meant by *"In fact, there is an operation that ended with an error or without an error."*?

I'm sorry, I didn't make myself clear enough.
This code:

```go
case <-s.syncChan:
	s.metrics.IncSyncOps()

	cnrs, err := s.cfg.cnrSource.List()
	if err != nil {
		s.metrics.IncSyncErrors()
		return
	}
```

Operations and errors are counted separately. I think it should be a single counter with label separation.
If we need to display the number of successful operations, we would have to use an expression like `irate(sync_operation[1m]) - irate(sync_operation_errors[1m])` instead of `irate(sync_operation{success="true"}[1m])`.

You mean that we should just use a single `HistogramVec` for operations in general (with a `success` label), since it includes a count of the observations as well?

I agree it's better. Done.
treeServiceLabelSuccess: fmt.Sprintf("%v", success),
}).Observe(d.Seconds())
}
func (m *treeServiceMetrics) AddReplicateWaitDuration(d time.Duration, success bool) {
m.replicateWaitDuration.value.With(prometheus.Labels{
treeServiceLabelSuccess: fmt.Sprintf("%v", success),
}).Observe(d.Seconds())
}
func (m *treeServiceMetrics) AddSyncDuration(d time.Duration, success bool) {
m.syncOpDuration.value.With(prometheus.Labels{
treeServiceLabelSuccess: fmt.Sprintf("%v", success),
}).Observe(d.Seconds())
}
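
As a sanity check on the single-histogram-with-`success`-label approach settled in the thread above, a hypothetical test (not part of this PR, and relying on the repo's internal `metric[T]` wrapper as used here) could record one success and one failure and assert that both land in the same metric family; success rates are then available as e.g. `rate(frostfs_node_treeservice_sync_duration_seconds_count{success="true"}[1m])`:

```go
package metrics

import (
	"testing"
	"time"

	"github.com/prometheus/client_golang/prometheus/testutil"
)

func TestTreeServiceSuccessLabel(t *testing.T) {
	m := newTreeServiceMetrics()
	m.AddSyncDuration(time.Second, true)
	m.AddSyncDuration(2*time.Second, false)

	// One histogram family, two labelled children:
	// success="true" and success="false".
	got := testutil.CollectAndCount(m.syncOpDuration.value,
		"frostfs_node_treeservice_sync_duration_seconds")
	if got != 2 {
		t.Fatalf("expected 2 labelled series, got %d", got)
	}
}
```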

View File

@@ -0,0 +1,15 @@
package tree
import "time"
type MetricsRegister interface {
AddReplicateTaskDuration(time.Duration, bool)
AddReplicateWaitDuration(time.Duration, bool)
AddSyncDuration(time.Duration, bool)
}
type defaultMetricsRegister struct{}
func (defaultMetricsRegister) AddReplicateTaskDuration(time.Duration, bool) {}
func (defaultMetricsRegister) AddReplicateWaitDuration(time.Duration, bool) {}
func (defaultMetricsRegister) AddSyncDuration(time.Duration, bool) {}
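
The no-op `defaultMetricsRegister` answers the "sane dummy non-null default" question below: call sites record metrics unconditionally, with no nil checks. A minimal sketch of the resulting pattern (`doWork` is a hypothetical stand-in):

```go
package tree

import "time"

// doWork stands in for any timed operation, e.g. one sync pass.
func doWork() error { return nil }

// timedOp shows the call pattern the non-nil default enables: the
// duration is always observed, with success derived from the error.
func timedOp(m MetricsRegister) {
	start := time.Now()
	err := doWork()
	m.AddSyncDuration(time.Since(start), err == nil)
}
```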

View File

@@ -33,6 +33,8 @@ type cfg struct {
replicatorWorkerCount int
replicatorTimeout time.Duration
containerCacheSize int
metrics MetricsRegister
}
// Option represents configuration option for a tree service.
@@ -116,3 +118,9 @@ func WithReplicationTimeout(t time.Duration) Option {
}
}
}
func WithMetrics(v MetricsRegister) Option {
return func(c *cfg) {
c.metrics = v
}
fyrchik marked this conversation as resolved

What about sane dummy non-null default?
}

View File

@@ -75,6 +75,7 @@ func (s *Service) replicationWorker(ctx context.Context) {
attribute.String("public_key", hex.EncodeToString(task.n.PublicKey())),
),
)
start := time.Now()
var lastErr error
var lastAddr string
@@ -113,6 +114,9 @@ func (s *Service) replicationWorker(ctx context.Context) {
zap.String("address", lastAddr),
zap.String("key", hex.EncodeToString(task.n.PublicKey())))
}
s.metrics.AddReplicateTaskDuration(time.Since(start), false)
} else {
s.metrics.AddReplicateTaskDuration(time.Since(start), true)
}
span.End()
}
@@ -137,6 +141,7 @@ func (s *Service) replicateLoop(ctx context.Context) {
case <-ctx.Done():
return
case op := <-s.replicateCh:
start := time.Now()
err := s.replicate(op)
if err != nil {
s.log.Error(logs.TreeErrorDuringReplication,
@@ -144,6 +149,7 @@ func (s *Service) replicateLoop(ctx context.Context) {
zap.Stringer("cid", op.cid),
zap.String("treeID", op.treeID))
fyrchik marked this conversation as resolved

Just to be aware: `replicate` does 3 things:

  1. Signs the operation.
  2. Calculates the list of nodes to replicate to.
  3. Sends the request.

While this metric _is_ useful (how much time do we wait in queue), can we also measure the time spent in `replicationWorker`?
}
fyrchik marked this conversation as resolved

Can we unify 3 lines with `s.metrics.AddReplicateWaitDuration(time.Since(start), err == nil)`?
s.metrics.AddReplicateWaitDuration(time.Since(start), err == nil)
}
}
}

View File

@@ -46,6 +46,7 @@ func New(opts ...Option) *Service {
s.replicatorChannelCapacity = defaultReplicatorCapacity
s.replicatorWorkerCount = defaultReplicatorWorkerCount
s.replicatorTimeout = defaultReplicatorSendTimeout
s.metrics = defaultMetricsRegister{}
for i := range opts {
opts[i](&s.cfg)

View File

@@ -9,6 +9,7 @@ import (
"math"
"math/rand"
"sync"
"time"
"git.frostfs.info/TrueCloudLab/frostfs-api-go/v2/pkg/tracing"
"git.frostfs.info/TrueCloudLab/frostfs-node/internal/logs"
@@ -376,9 +377,12 @@ func (s *Service) syncLoop(ctx context.Context) {
ctx, span := tracing.StartSpanFromContext(ctx, "TreeService.sync")
s.log.Debug(logs.TreeSyncingTrees)

If by "ahh" you mean "I don't like this empty line", I removed it. I wish `gofmt` would eventually have a say on empty lines, to avoid wasting time discussing them.

It's not about an empty line per se, it's about unrelated changes in the commit.
start := time.Now()
cnrs, err := s.cfg.cnrSource.List()
if err != nil {
s.log.Error(logs.TreeCouldNotFetchContainers, zap.Error(err))
s.metrics.AddSyncDuration(time.Since(start), false)
span.End()
continue
}
@@ -390,6 +394,8 @@ func (s *Service) syncLoop(ctx context.Context) {
s.removeContainers(ctx, newMap)
s.log.Debug(logs.TreeTreesHaveBeenSynchronized)
s.metrics.AddSyncDuration(time.Since(start), true)
span.End()
}
}
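
For comparison, the two `AddSyncDuration` call sites in `syncLoop` could be collapsed into one deferred observation; a hypothetical variant, not what this PR does:

```go
package tree

import "time"

// syncOnce sketches a defer-based alternative to the explicit
// AddSyncDuration calls in syncLoop: a single exit point records the
// duration no matter which branch bails out.
func syncOnce(m MetricsRegister, list func() error) {
	start := time.Now()
	success := false
	defer func() { m.AddSyncDuration(time.Since(start), success) }()

	if err := list(); err != nil {
		return // success stays false
	}
	success = true
}
```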