[#370] Add tree service metrics

Signed-off-by: Alejandro Lopez <a.lopez@yadro.com>
This commit is contained in:
Alejandro Lopez 2023-05-24 10:01:50 +03:00
parent a1823423cf
commit bfded3a547
9 changed files with 117 additions and 3 deletions

View file

@ -10,6 +10,7 @@ Changelog for FrostFS Node
- Support copies number parameter in `frostfs-cli object put` (#351) - Support copies number parameter in `frostfs-cli object put` (#351)
- Set extra wallets on SIGHUP for ir (#125) - Set extra wallets on SIGHUP for ir (#125)
- Writecache metrics (#312) - Writecache metrics (#312)
- Add tree service metrics (#370)
### Changed ### Changed
- `frostfs-cli util locode generate` is now much faster (#309) - `frostfs-cli util locode generate` is now much faster (#309)

View file

@ -55,7 +55,8 @@ func initTreeService(c *cfg) {
tree.WithContainerCacheSize(treeConfig.CacheSize()), tree.WithContainerCacheSize(treeConfig.CacheSize()),
tree.WithReplicationTimeout(treeConfig.ReplicationTimeout()), tree.WithReplicationTimeout(treeConfig.ReplicationTimeout()),
tree.WithReplicationChannelCapacity(treeConfig.ReplicationChannelCapacity()), tree.WithReplicationChannelCapacity(treeConfig.ReplicationChannelCapacity()),
tree.WithReplicationWorkerCount(treeConfig.ReplicationWorkerCount())) tree.WithReplicationWorkerCount(treeConfig.ReplicationWorkerCount()),
tree.WithMetrics(c.metricsCollector.TreeService()))
for _, srv := range c.cfgGRPC.servers { for _, srv := range c.cfgGRPC.servers {
tree.RegisterTreeServiceServer(srv, c.treeService) tree.RegisterTreeServiceServer(srv, c.treeService)

View file

@ -1,6 +1,9 @@
package metrics package metrics
import "github.com/prometheus/client_golang/prometheus" import (
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/services/tree"
"github.com/prometheus/client_golang/prometheus"
)
const namespace = "frostfs_node" const namespace = "frostfs_node"
@ -9,9 +12,10 @@ type NodeMetrics struct {
engineMetrics engineMetrics
stateMetrics stateMetrics
replicatorMetrics replicatorMetrics
epoch metric[prometheus.Gauge]
writeCacheMetrics *writeCacheMetrics writeCacheMetrics *writeCacheMetrics
treeService *treeServiceMetrics
epoch metric[prometheus.Gauge]
} }
func NewNodeMetrics() *NodeMetrics { func NewNodeMetrics() *NodeMetrics {
@ -27,6 +31,9 @@ func NewNodeMetrics() *NodeMetrics {
replicator := newReplicatorMetrics() replicator := newReplicatorMetrics()
replicator.register() replicator.register()
treeService := newTreeServiceMetrics()
treeService.register()
epoch := newGauge(prometheus.GaugeOpts{ epoch := newGauge(prometheus.GaugeOpts{
Namespace: namespace, Namespace: namespace,
Subsystem: innerRingSubsystem, Subsystem: innerRingSubsystem,
@ -43,6 +50,7 @@ func NewNodeMetrics() *NodeMetrics {
engineMetrics: engine, engineMetrics: engine,
stateMetrics: state, stateMetrics: state,
replicatorMetrics: replicator, replicatorMetrics: replicator,
treeService: treeService,
epoch: epoch, epoch: epoch,
writeCacheMetrics: writeCacheMetrics, writeCacheMetrics: writeCacheMetrics,
} }
@ -60,3 +68,7 @@ func (m *NodeMetrics) WriteCache() WriteCacheMetrics {
} }
return m.writeCacheMetrics return m.writeCacheMetrics
} }
func (m *NodeMetrics) TreeService() tree.MetricsRegister {
return m.treeService
}

View file

@ -0,0 +1,64 @@
package metrics
import (
"fmt"
"time"
"github.com/prometheus/client_golang/prometheus"
)
const treeServiceLabelSuccess = "success"
type treeServiceMetrics struct {
replicateTaskDuration metric[*prometheus.HistogramVec]
replicateWaitDuration metric[*prometheus.HistogramVec]
syncOpDuration metric[*prometheus.HistogramVec]
}
func newTreeServiceMetrics() *treeServiceMetrics {
const treeServiceSubsystem = "treeservice"
return &treeServiceMetrics{
replicateTaskDuration: newHistogramVec(prometheus.HistogramOpts{
Namespace: namespace,
Subsystem: treeServiceSubsystem,
Name: "replicate_task_duration_seconds",
Help: "Duration of individual replication tasks executed as part of replication loops",
}, []string{treeServiceLabelSuccess}),
replicateWaitDuration: newHistogramVec(prometheus.HistogramOpts{
Namespace: namespace,
Subsystem: treeServiceSubsystem,
Name: "replicate_wait_duration_seconds",
Help: "Duration of overall waiting time for replication loops",
}, []string{treeServiceLabelSuccess}),
syncOpDuration: newHistogramVec(prometheus.HistogramOpts{
Namespace: namespace,
Subsystem: treeServiceSubsystem,
Name: "sync_duration_seconds",
Help: "Duration of synchronization operations",
}, []string{treeServiceLabelSuccess}),
}
}
func (m *treeServiceMetrics) register() {
mustRegister(m.replicateTaskDuration)
mustRegister(m.replicateWaitDuration)
mustRegister(m.syncOpDuration)
}
func (m *treeServiceMetrics) AddReplicateTaskDuration(d time.Duration, success bool) {
m.replicateTaskDuration.value.With(prometheus.Labels{
treeServiceLabelSuccess: fmt.Sprintf("%v", success),
}).Observe(d.Seconds())
}
func (m *treeServiceMetrics) AddReplicateWaitDuration(d time.Duration, success bool) {
m.replicateWaitDuration.value.With(prometheus.Labels{
treeServiceLabelSuccess: fmt.Sprintf("%v", success),
}).Observe(d.Seconds())
}
func (m *treeServiceMetrics) AddSyncDuration(d time.Duration, success bool) {
m.syncOpDuration.value.With(prometheus.Labels{
treeServiceLabelSuccess: fmt.Sprintf("%v", success),
}).Observe(d.Seconds())
}

View file

@ -0,0 +1,15 @@
package tree
import "time"
type MetricsRegister interface {
AddReplicateTaskDuration(time.Duration, bool)
AddReplicateWaitDuration(time.Duration, bool)
AddSyncDuration(time.Duration, bool)
}
type defaultMetricsRegister struct{}
func (defaultMetricsRegister) AddReplicateTaskDuration(time.Duration, bool) {}
func (defaultMetricsRegister) AddReplicateWaitDuration(time.Duration, bool) {}
func (defaultMetricsRegister) AddSyncDuration(time.Duration, bool) {}

View file

@ -33,6 +33,8 @@ type cfg struct {
replicatorWorkerCount int replicatorWorkerCount int
replicatorTimeout time.Duration replicatorTimeout time.Duration
containerCacheSize int containerCacheSize int
metrics MetricsRegister
} }
// Option represents configuration option for a tree service. // Option represents configuration option for a tree service.
@ -116,3 +118,9 @@ func WithReplicationTimeout(t time.Duration) Option {
} }
} }
} }
func WithMetrics(v MetricsRegister) Option {
return func(c *cfg) {
c.metrics = v
}
}

View file

@ -75,6 +75,7 @@ func (s *Service) replicationWorker(ctx context.Context) {
attribute.String("public_key", hex.EncodeToString(task.n.PublicKey())), attribute.String("public_key", hex.EncodeToString(task.n.PublicKey())),
), ),
) )
start := time.Now()
var lastErr error var lastErr error
var lastAddr string var lastAddr string
@ -113,6 +114,9 @@ func (s *Service) replicationWorker(ctx context.Context) {
zap.String("address", lastAddr), zap.String("address", lastAddr),
zap.String("key", hex.EncodeToString(task.n.PublicKey()))) zap.String("key", hex.EncodeToString(task.n.PublicKey())))
} }
s.metrics.AddReplicateTaskDuration(time.Since(start), false)
} else {
s.metrics.AddReplicateTaskDuration(time.Since(start), true)
} }
span.End() span.End()
} }
@ -137,6 +141,7 @@ func (s *Service) replicateLoop(ctx context.Context) {
case <-ctx.Done(): case <-ctx.Done():
return return
case op := <-s.replicateCh: case op := <-s.replicateCh:
start := time.Now()
err := s.replicate(op) err := s.replicate(op)
if err != nil { if err != nil {
s.log.Error(logs.TreeErrorDuringReplication, s.log.Error(logs.TreeErrorDuringReplication,
@ -144,6 +149,7 @@ func (s *Service) replicateLoop(ctx context.Context) {
zap.Stringer("cid", op.cid), zap.Stringer("cid", op.cid),
zap.String("treeID", op.treeID)) zap.String("treeID", op.treeID))
} }
s.metrics.AddReplicateWaitDuration(time.Since(start), err == nil)
} }
} }
} }

View file

@ -46,6 +46,7 @@ func New(opts ...Option) *Service {
s.replicatorChannelCapacity = defaultReplicatorCapacity s.replicatorChannelCapacity = defaultReplicatorCapacity
s.replicatorWorkerCount = defaultReplicatorWorkerCount s.replicatorWorkerCount = defaultReplicatorWorkerCount
s.replicatorTimeout = defaultReplicatorSendTimeout s.replicatorTimeout = defaultReplicatorSendTimeout
s.metrics = defaultMetricsRegister{}
for i := range opts { for i := range opts {
opts[i](&s.cfg) opts[i](&s.cfg)

View file

@ -9,6 +9,7 @@ import (
"math" "math"
"math/rand" "math/rand"
"sync" "sync"
"time"
"git.frostfs.info/TrueCloudLab/frostfs-api-go/v2/pkg/tracing" "git.frostfs.info/TrueCloudLab/frostfs-api-go/v2/pkg/tracing"
"git.frostfs.info/TrueCloudLab/frostfs-node/internal/logs" "git.frostfs.info/TrueCloudLab/frostfs-node/internal/logs"
@ -376,9 +377,12 @@ func (s *Service) syncLoop(ctx context.Context) {
ctx, span := tracing.StartSpanFromContext(ctx, "TreeService.sync") ctx, span := tracing.StartSpanFromContext(ctx, "TreeService.sync")
s.log.Debug(logs.TreeSyncingTrees) s.log.Debug(logs.TreeSyncingTrees)
start := time.Now()
cnrs, err := s.cfg.cnrSource.List() cnrs, err := s.cfg.cnrSource.List()
if err != nil { if err != nil {
s.log.Error(logs.TreeCouldNotFetchContainers, zap.Error(err)) s.log.Error(logs.TreeCouldNotFetchContainers, zap.Error(err))
s.metrics.AddSyncDuration(time.Since(start), false)
span.End() span.End()
continue continue
} }
@ -390,6 +394,8 @@ func (s *Service) syncLoop(ctx context.Context) {
s.removeContainers(ctx, newMap) s.removeContainers(ctx, newMap)
s.log.Debug(logs.TreeTreesHaveBeenSynchronized) s.log.Debug(logs.TreeTreesHaveBeenSynchronized)
s.metrics.AddSyncDuration(time.Since(start), true)
span.End() span.End()
} }
} }