[#370] Add tree service metrics

Signed-off-by: Alejandro Lopez <a.lopez@yadro.com>
This commit is contained in:
Alejandro Lopez 2023-05-24 10:01:50 +03:00 committed by Evgenii Stratonikov
parent f2e5dead7e
commit bc34fee6a7
9 changed files with 117 additions and 3 deletions

View file

@ -10,6 +10,7 @@ Changelog for FrostFS Node
- Support copies number parameter in `frostfs-cli object put` (#351)
- Set extra wallets on SIGHUP for ir (#125)
- Writecache metrics (#312)
- Add tree service metrics (#370)
### Changed
- `frostfs-cli util locode generate` is now much faster (#309)

View file

@ -55,7 +55,8 @@ func initTreeService(c *cfg) {
tree.WithContainerCacheSize(treeConfig.CacheSize()),
tree.WithReplicationTimeout(treeConfig.ReplicationTimeout()),
tree.WithReplicationChannelCapacity(treeConfig.ReplicationChannelCapacity()),
tree.WithReplicationWorkerCount(treeConfig.ReplicationWorkerCount()))
tree.WithReplicationWorkerCount(treeConfig.ReplicationWorkerCount()),
tree.WithMetrics(c.metricsCollector.TreeService()))
for _, srv := range c.cfgGRPC.servers {
tree.RegisterTreeServiceServer(srv, c.treeService)

View file

@ -1,6 +1,9 @@
package metrics
import "github.com/prometheus/client_golang/prometheus"
import (
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/services/tree"
"github.com/prometheus/client_golang/prometheus"
)
const namespace = "frostfs_node"
@ -9,9 +12,10 @@ type NodeMetrics struct {
engineMetrics
stateMetrics
replicatorMetrics
epoch metric[prometheus.Gauge]
writeCacheMetrics *writeCacheMetrics
treeService *treeServiceMetrics
epoch metric[prometheus.Gauge]
}
func NewNodeMetrics() *NodeMetrics {
@ -27,6 +31,9 @@ func NewNodeMetrics() *NodeMetrics {
replicator := newReplicatorMetrics()
replicator.register()
treeService := newTreeServiceMetrics()
treeService.register()
epoch := newGauge(prometheus.GaugeOpts{
Namespace: namespace,
Subsystem: innerRingSubsystem,
@ -43,6 +50,7 @@ func NewNodeMetrics() *NodeMetrics {
engineMetrics: engine,
stateMetrics: state,
replicatorMetrics: replicator,
treeService: treeService,
epoch: epoch,
writeCacheMetrics: writeCacheMetrics,
}
@ -60,3 +68,7 @@ func (m *NodeMetrics) WriteCache() WriteCacheMetrics {
}
return m.writeCacheMetrics
}
// TreeService returns the tree service metrics stored in the node metrics,
// exposed through the tree.MetricsRegister interface.
func (m *NodeMetrics) TreeService() tree.MetricsRegister {
	var register tree.MetricsRegister = m.treeService
	return register
}

View file

@ -0,0 +1,64 @@
package metrics
import (
"fmt"
"time"
"github.com/prometheus/client_golang/prometheus"
)
// treeServiceLabelSuccess is the name of the single label attached to every
// tree service histogram.
const treeServiceLabelSuccess = "success"

// treeServiceMetrics groups the duration histograms of the tree service.
type treeServiceMetrics struct {
	// replicateTaskDuration is fed by AddReplicateTaskDuration.
	replicateTaskDuration metric[*prometheus.HistogramVec]
	// replicateWaitDuration is fed by AddReplicateWaitDuration.
	replicateWaitDuration metric[*prometheus.HistogramVec]
	// syncOpDuration is fed by AddSyncDuration.
	syncOpDuration metric[*prometheus.HistogramVec]
}
// newTreeServiceMetrics builds the three tree service duration histograms.
// All of them use the package namespace, the "treeservice" subsystem and a
// single label named by treeServiceLabelSuccess. The histograms are only
// constructed here; registration is done separately by register.
func newTreeServiceMetrics() *treeServiceMetrics {
	const treeServiceSubsystem = "treeservice"

	// durationHistogram fills in the options shared by every histogram below,
	// so each metric only states its own name and help text.
	durationHistogram := func(name, help string) metric[*prometheus.HistogramVec] {
		opts := prometheus.HistogramOpts{
			Namespace: namespace,
			Subsystem: treeServiceSubsystem,
			Name:      name,
			Help:      help,
		}
		return newHistogramVec(opts, []string{treeServiceLabelSuccess})
	}

	result := new(treeServiceMetrics)
	result.replicateTaskDuration = durationHistogram(
		"replicate_task_duration_seconds",
		"Duration of individual replication tasks executed as part of replication loops",
	)
	result.replicateWaitDuration = durationHistogram(
		"replicate_wait_duration_seconds",
		"Duration of overall waiting time for replication loops",
	)
	result.syncOpDuration = durationHistogram(
		"sync_duration_seconds",
		"Duration of synchronization operations",
	)
	return result
}
// register passes every tree service histogram to mustRegister, in the order
// task duration, wait duration, sync duration.
func (m *treeServiceMetrics) register() {
	histograms := []metric[*prometheus.HistogramVec]{
		m.replicateTaskDuration,
		m.replicateWaitDuration,
		m.syncOpDuration,
	}
	for _, histogram := range histograms {
		mustRegister(histogram)
	}
}
// AddReplicateTaskDuration observes d, converted to seconds, in the replicate
// task duration histogram. The observation is filed under the success label
// value "true" or "false", taken from the success flag.
func (m *treeServiceMetrics) AddReplicateTaskDuration(d time.Duration, success bool) {
	labels := prometheus.Labels{treeServiceLabelSuccess: fmt.Sprint(success)}
	m.replicateTaskDuration.value.With(labels).Observe(d.Seconds())
}
// AddReplicateWaitDuration observes d, converted to seconds, in the replicate
// wait duration histogram. The observation is filed under the success label
// value "true" or "false", taken from the success flag.
func (m *treeServiceMetrics) AddReplicateWaitDuration(d time.Duration, success bool) {
	labels := prometheus.Labels{treeServiceLabelSuccess: fmt.Sprint(success)}
	m.replicateWaitDuration.value.With(labels).Observe(d.Seconds())
}
// AddSyncDuration observes d, converted to seconds, in the sync duration
// histogram. The observation is filed under the success label value "true"
// or "false", taken from the success flag.
func (m *treeServiceMetrics) AddSyncDuration(d time.Duration, success bool) {
	labels := prometheus.Labels{treeServiceLabelSuccess: fmt.Sprint(success)}
	m.syncOpDuration.value.With(labels).Observe(d.Seconds())
}

View file

@ -0,0 +1,15 @@
package tree
import "time"
// MetricsRegister receives the durations measured by the tree service.
// Every method takes the measured duration and a flag telling whether the
// measured operation finished without an error.
type MetricsRegister interface {
	// AddReplicateTaskDuration reports the duration of one replication task
	// handled by a replication worker.
	AddReplicateTaskDuration(d time.Duration, success bool)
	// AddReplicateWaitDuration reports the duration of one replicate call made
	// from the replication loop.
	AddReplicateWaitDuration(d time.Duration, success bool)
	// AddSyncDuration reports the duration of one iteration of the sync loop.
	AddSyncDuration(d time.Duration, success bool)
}
// defaultMetricsRegister is a register whose methods do nothing. It has the
// same three methods as MetricsRegister and carries no state.
type defaultMetricsRegister struct{}

// AddReplicateTaskDuration discards the observation.
func (defaultMetricsRegister) AddReplicateTaskDuration(_ time.Duration, _ bool) {}

// AddReplicateWaitDuration discards the observation.
func (defaultMetricsRegister) AddReplicateWaitDuration(_ time.Duration, _ bool) {}

// AddSyncDuration discards the observation.
func (defaultMetricsRegister) AddSyncDuration(_ time.Duration, _ bool) {}

View file

@ -33,6 +33,8 @@ type cfg struct {
replicatorWorkerCount int
replicatorTimeout time.Duration
containerCacheSize int
metrics MetricsRegister
}
// Option represents configuration option for a tree service.
@ -116,3 +118,9 @@ func WithReplicationTimeout(t time.Duration) Option {
}
}
}
// WithMetrics returns an option that sets the metrics register the tree
// service reports its durations to.
//
// A nil register is ignored: the register already stored in the configuration
// stays in place rather than being replaced by a nil interface, on which any
// method call would panic.
func WithMetrics(v MetricsRegister) Option {
	return func(c *cfg) {
		if v == nil {
			return
		}
		c.metrics = v
	}
}

View file

@ -75,6 +75,7 @@ func (s *Service) replicationWorker(ctx context.Context) {
attribute.String("public_key", hex.EncodeToString(task.n.PublicKey())),
),
)
start := time.Now()
var lastErr error
var lastAddr string
@ -113,6 +114,9 @@ func (s *Service) replicationWorker(ctx context.Context) {
zap.String("address", lastAddr),
zap.String("key", hex.EncodeToString(task.n.PublicKey())))
}
s.metrics.AddReplicateTaskDuration(time.Since(start), false)
} else {
s.metrics.AddReplicateTaskDuration(time.Since(start), true)
}
span.End()
}
@ -137,6 +141,7 @@ func (s *Service) replicateLoop(ctx context.Context) {
case <-ctx.Done():
return
case op := <-s.replicateCh:
start := time.Now()
err := s.replicate(op)
if err != nil {
s.log.Error(logs.TreeErrorDuringReplication,
@ -144,6 +149,7 @@ func (s *Service) replicateLoop(ctx context.Context) {
zap.Stringer("cid", op.cid),
zap.String("treeID", op.treeID))
}
s.metrics.AddReplicateWaitDuration(time.Since(start), err == nil)
}
}
}

View file

@ -46,6 +46,7 @@ func New(opts ...Option) *Service {
s.replicatorChannelCapacity = defaultReplicatorCapacity
s.replicatorWorkerCount = defaultReplicatorWorkerCount
s.replicatorTimeout = defaultReplicatorSendTimeout
s.metrics = defaultMetricsRegister{}
for i := range opts {
opts[i](&s.cfg)

View file

@ -9,6 +9,7 @@ import (
"math"
"math/rand"
"sync"
"time"
"git.frostfs.info/TrueCloudLab/frostfs-api-go/v2/pkg/tracing"
"git.frostfs.info/TrueCloudLab/frostfs-node/internal/logs"
@ -376,9 +377,12 @@ func (s *Service) syncLoop(ctx context.Context) {
ctx, span := tracing.StartSpanFromContext(ctx, "TreeService.sync")
s.log.Debug(logs.TreeSyncingTrees)
start := time.Now()
cnrs, err := s.cfg.cnrSource.List()
if err != nil {
s.log.Error(logs.TreeCouldNotFetchContainers, zap.Error(err))
s.metrics.AddSyncDuration(time.Since(start), false)
span.End()
continue
}
@ -390,6 +394,8 @@ func (s *Service) syncLoop(ctx context.Context) {
s.removeContainers(ctx, newMap)
s.log.Debug(logs.TreeTreesHaveBeenSynchronized)
s.metrics.AddSyncDuration(time.Since(start), true)
span.End()
}
}