From bfded3a547ac6245b993e550960236139994160e Mon Sep 17 00:00:00 2001 From: Alejandro Lopez Date: Wed, 24 May 2023 10:01:50 +0300 Subject: [PATCH] [#370] Add tree service metrics Signed-off-by: Alejandro Lopez --- CHANGELOG.md | 1 + cmd/frostfs-node/tree.go | 3 +- pkg/metrics/node.go | 16 +++++++-- pkg/metrics/treeservice.go | 64 +++++++++++++++++++++++++++++++++ pkg/services/tree/metrics.go | 15 ++++++++ pkg/services/tree/options.go | 8 +++++ pkg/services/tree/replicator.go | 6 ++++ pkg/services/tree/service.go | 1 + pkg/services/tree/sync.go | 6 ++++ 9 files changed, 117 insertions(+), 3 deletions(-) create mode 100644 pkg/metrics/treeservice.go create mode 100644 pkg/services/tree/metrics.go diff --git a/CHANGELOG.md b/CHANGELOG.md index b5924b781..83ec8e41a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ Changelog for FrostFS Node - Support copies number parameter in `frostfs-cli object put` (#351) - Set extra wallets on SIGHUP for ir (#125) - Writecache metrics (#312) +- Add tree service metrics (#370) ### Changed - `frostfs-cli util locode generate` is now much faster (#309) diff --git a/cmd/frostfs-node/tree.go b/cmd/frostfs-node/tree.go index b4f43acac..fffaa01d1 100644 --- a/cmd/frostfs-node/tree.go +++ b/cmd/frostfs-node/tree.go @@ -55,7 +55,8 @@ func initTreeService(c *cfg) { tree.WithContainerCacheSize(treeConfig.CacheSize()), tree.WithReplicationTimeout(treeConfig.ReplicationTimeout()), tree.WithReplicationChannelCapacity(treeConfig.ReplicationChannelCapacity()), - tree.WithReplicationWorkerCount(treeConfig.ReplicationWorkerCount())) + tree.WithReplicationWorkerCount(treeConfig.ReplicationWorkerCount()), + tree.WithMetrics(c.metricsCollector.TreeService())) for _, srv := range c.cfgGRPC.servers { tree.RegisterTreeServiceServer(srv, c.treeService) diff --git a/pkg/metrics/node.go b/pkg/metrics/node.go index b8041eec8..cca82b5fe 100644 --- a/pkg/metrics/node.go +++ b/pkg/metrics/node.go @@ -1,6 +1,9 @@ package metrics -import "github.com/prometheus/client_golang/prometheus" +import ( + "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/services/tree" + "github.com/prometheus/client_golang/prometheus" +) const namespace = "frostfs_node" @@ -9,9 +12,10 @@ type NodeMetrics struct { engineMetrics stateMetrics replicatorMetrics - epoch metric[prometheus.Gauge] writeCacheMetrics *writeCacheMetrics + treeService *treeServiceMetrics + epoch metric[prometheus.Gauge] } func NewNodeMetrics() *NodeMetrics { @@ -27,6 +31,9 @@ func NewNodeMetrics() *NodeMetrics { replicator := newReplicatorMetrics() replicator.register() + treeService := newTreeServiceMetrics() + treeService.register() + epoch := newGauge(prometheus.GaugeOpts{ Namespace: namespace, Subsystem: innerRingSubsystem, @@ -43,6 +50,7 @@ func NewNodeMetrics() *NodeMetrics { engineMetrics: engine, stateMetrics: state, replicatorMetrics: replicator, + treeService: treeService, epoch: epoch, writeCacheMetrics: writeCacheMetrics, } @@ -60,3 +68,7 @@ func (m *NodeMetrics) WriteCache() WriteCacheMetrics { } return m.writeCacheMetrics } + +func (m *NodeMetrics) TreeService() tree.MetricsRegister { + return m.treeService +} diff --git a/pkg/metrics/treeservice.go b/pkg/metrics/treeservice.go new file mode 100644 index 000000000..135f6e6d2 --- /dev/null +++ b/pkg/metrics/treeservice.go @@ -0,0 +1,64 @@ +package metrics + +import ( + "fmt" + "time" + + "github.com/prometheus/client_golang/prometheus" +) + +const treeServiceLabelSuccess = "success" + +type treeServiceMetrics struct { + replicateTaskDuration metric[*prometheus.HistogramVec] + replicateWaitDuration metric[*prometheus.HistogramVec] + syncOpDuration metric[*prometheus.HistogramVec] +} + +func newTreeServiceMetrics() *treeServiceMetrics { + const treeServiceSubsystem = "treeservice" + return &treeServiceMetrics{ + replicateTaskDuration: newHistogramVec(prometheus.HistogramOpts{ + Namespace: namespace, + Subsystem: treeServiceSubsystem, + Name: "replicate_task_duration_seconds", + Help: "Duration of individual replication tasks executed as part of replication loops", + }, []string{treeServiceLabelSuccess}), + replicateWaitDuration: newHistogramVec(prometheus.HistogramOpts{ + Namespace: namespace, + Subsystem: treeServiceSubsystem, + Name: "replicate_wait_duration_seconds", + Help: "Duration of overall waiting time for replication loops", + }, []string{treeServiceLabelSuccess}), + syncOpDuration: newHistogramVec(prometheus.HistogramOpts{ + Namespace: namespace, + Subsystem: treeServiceSubsystem, + Name: "sync_duration_seconds", + Help: "Duration of synchronization operations", + }, []string{treeServiceLabelSuccess}), + } +} + +func (m *treeServiceMetrics) register() { + mustRegister(m.replicateTaskDuration) + mustRegister(m.replicateWaitDuration) + mustRegister(m.syncOpDuration) +} + +func (m *treeServiceMetrics) AddReplicateTaskDuration(d time.Duration, success bool) { + m.replicateTaskDuration.value.With(prometheus.Labels{ + treeServiceLabelSuccess: fmt.Sprintf("%v", success), + }).Observe(d.Seconds()) +} + +func (m *treeServiceMetrics) AddReplicateWaitDuration(d time.Duration, success bool) { + m.replicateWaitDuration.value.With(prometheus.Labels{ + treeServiceLabelSuccess: fmt.Sprintf("%v", success), + }).Observe(d.Seconds()) +} + +func (m *treeServiceMetrics) AddSyncDuration(d time.Duration, success bool) { + m.syncOpDuration.value.With(prometheus.Labels{ + treeServiceLabelSuccess: fmt.Sprintf("%v", success), + }).Observe(d.Seconds()) +} diff --git a/pkg/services/tree/metrics.go b/pkg/services/tree/metrics.go new file mode 100644 index 000000000..0f0e4ee57 --- /dev/null +++ b/pkg/services/tree/metrics.go @@ -0,0 +1,15 @@ +package tree + +import "time" + +type MetricsRegister interface { + AddReplicateTaskDuration(time.Duration, bool) + AddReplicateWaitDuration(time.Duration, bool) + AddSyncDuration(time.Duration, bool) +} + +type defaultMetricsRegister struct{} + +func (defaultMetricsRegister) AddReplicateTaskDuration(time.Duration, bool) {} +func (defaultMetricsRegister) AddReplicateWaitDuration(time.Duration, bool) {} +func (defaultMetricsRegister) AddSyncDuration(time.Duration, bool) {} diff --git a/pkg/services/tree/options.go b/pkg/services/tree/options.go index d60bc14c5..bcaf21f92 100644 --- a/pkg/services/tree/options.go +++ b/pkg/services/tree/options.go @@ -33,6 +33,8 @@ type cfg struct { replicatorWorkerCount int replicatorTimeout time.Duration containerCacheSize int + + metrics MetricsRegister } // Option represents configuration option for a tree service. @@ -116,3 +118,9 @@ func WithReplicationTimeout(t time.Duration) Option { } } } + +func WithMetrics(v MetricsRegister) Option { + return func(c *cfg) { + c.metrics = v + } +} diff --git a/pkg/services/tree/replicator.go b/pkg/services/tree/replicator.go index 60d0eff50..7199dc40e 100644 --- a/pkg/services/tree/replicator.go +++ b/pkg/services/tree/replicator.go @@ -75,6 +75,7 @@ func (s *Service) replicationWorker(ctx context.Context) { attribute.String("public_key", hex.EncodeToString(task.n.PublicKey())), ), ) + start := time.Now() var lastErr error var lastAddr string @@ -113,6 +114,9 @@ func (s *Service) replicationWorker(ctx context.Context) { zap.String("address", lastAddr), zap.String("key", hex.EncodeToString(task.n.PublicKey()))) } + s.metrics.AddReplicateTaskDuration(time.Since(start), false) + } else { + s.metrics.AddReplicateTaskDuration(time.Since(start), true) } span.End() } @@ -137,6 +141,7 @@ func (s *Service) replicateLoop(ctx context.Context) { case <-ctx.Done(): return case op := <-s.replicateCh: + start := time.Now() err := s.replicate(op) if err != nil { s.log.Error(logs.TreeErrorDuringReplication, @@ -144,6 +149,7 @@ func (s *Service) replicateLoop(ctx context.Context) { zap.Stringer("cid", op.cid), zap.String("treeID", op.treeID)) } + s.metrics.AddReplicateWaitDuration(time.Since(start), err == nil) } } } diff --git a/pkg/services/tree/service.go b/pkg/services/tree/service.go index 546b7a207..96e547f36 100644 --- a/pkg/services/tree/service.go +++ b/pkg/services/tree/service.go @@ -46,6 +46,7 @@ func New(opts ...Option) *Service { s.replicatorChannelCapacity = defaultReplicatorCapacity s.replicatorWorkerCount = defaultReplicatorWorkerCount s.replicatorTimeout = defaultReplicatorSendTimeout + s.metrics = defaultMetricsRegister{} for i := range opts { opts[i](&s.cfg) diff --git a/pkg/services/tree/sync.go b/pkg/services/tree/sync.go index ed2455194..ec51c6bc6 100644 --- a/pkg/services/tree/sync.go +++ b/pkg/services/tree/sync.go @@ -9,6 +9,7 @@ import ( "math" "math/rand" "sync" + "time" "git.frostfs.info/TrueCloudLab/frostfs-api-go/v2/pkg/tracing" "git.frostfs.info/TrueCloudLab/frostfs-node/internal/logs" @@ -376,9 +377,12 @@ func (s *Service) syncLoop(ctx context.Context) { ctx, span := tracing.StartSpanFromContext(ctx, "TreeService.sync") s.log.Debug(logs.TreeSyncingTrees) + start := time.Now() + cnrs, err := s.cfg.cnrSource.List() if err != nil { s.log.Error(logs.TreeCouldNotFetchContainers, zap.Error(err)) + s.metrics.AddSyncDuration(time.Since(start), false) span.End() continue } @@ -390,6 +394,8 @@ func (s *Service) syncLoop(ctx context.Context) { s.removeContainers(ctx, newMap) s.log.Debug(logs.TreeTreesHaveBeenSynchronized) + + s.metrics.AddSyncDuration(time.Since(start), true) span.End() } }