forked from TrueCloudLab/frostfs-node
[#370] Add tree service metrics
Signed-off-by: Alejandro Lopez <a.lopez@yadro.com>
This commit is contained in:
parent
f2e5dead7e
commit
bc34fee6a7
9 changed files with 117 additions and 3 deletions
|
@ -10,6 +10,7 @@ Changelog for FrostFS Node
|
||||||
- Support copies number parameter in `frostfs-cli object put` (#351)
|
- Support copies number parameter in `frostfs-cli object put` (#351)
|
||||||
- Set extra wallets on SIGHUP for ir (#125)
|
- Set extra wallets on SIGHUP for ir (#125)
|
||||||
- Writecache metrics (#312)
|
- Writecache metrics (#312)
|
||||||
|
- Add tree service metrics (#370)
|
||||||
|
|
||||||
### Changed
|
### Changed
|
||||||
- `frostfs-cli util locode generate` is now much faster (#309)
|
- `frostfs-cli util locode generate` is now much faster (#309)
|
||||||
|
|
|
@ -55,7 +55,8 @@ func initTreeService(c *cfg) {
|
||||||
tree.WithContainerCacheSize(treeConfig.CacheSize()),
|
tree.WithContainerCacheSize(treeConfig.CacheSize()),
|
||||||
tree.WithReplicationTimeout(treeConfig.ReplicationTimeout()),
|
tree.WithReplicationTimeout(treeConfig.ReplicationTimeout()),
|
||||||
tree.WithReplicationChannelCapacity(treeConfig.ReplicationChannelCapacity()),
|
tree.WithReplicationChannelCapacity(treeConfig.ReplicationChannelCapacity()),
|
||||||
tree.WithReplicationWorkerCount(treeConfig.ReplicationWorkerCount()))
|
tree.WithReplicationWorkerCount(treeConfig.ReplicationWorkerCount()),
|
||||||
|
tree.WithMetrics(c.metricsCollector.TreeService()))
|
||||||
|
|
||||||
for _, srv := range c.cfgGRPC.servers {
|
for _, srv := range c.cfgGRPC.servers {
|
||||||
tree.RegisterTreeServiceServer(srv, c.treeService)
|
tree.RegisterTreeServiceServer(srv, c.treeService)
|
||||||
|
|
|
@ -1,6 +1,9 @@
|
||||||
package metrics
|
package metrics
|
||||||
|
|
||||||
import "github.com/prometheus/client_golang/prometheus"
|
import (
|
||||||
|
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/services/tree"
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
|
)
|
||||||
|
|
||||||
const namespace = "frostfs_node"
|
const namespace = "frostfs_node"
|
||||||
|
|
||||||
|
@ -9,9 +12,10 @@ type NodeMetrics struct {
|
||||||
engineMetrics
|
engineMetrics
|
||||||
stateMetrics
|
stateMetrics
|
||||||
replicatorMetrics
|
replicatorMetrics
|
||||||
epoch metric[prometheus.Gauge]
|
|
||||||
|
|
||||||
writeCacheMetrics *writeCacheMetrics
|
writeCacheMetrics *writeCacheMetrics
|
||||||
|
treeService *treeServiceMetrics
|
||||||
|
epoch metric[prometheus.Gauge]
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewNodeMetrics() *NodeMetrics {
|
func NewNodeMetrics() *NodeMetrics {
|
||||||
|
@ -27,6 +31,9 @@ func NewNodeMetrics() *NodeMetrics {
|
||||||
replicator := newReplicatorMetrics()
|
replicator := newReplicatorMetrics()
|
||||||
replicator.register()
|
replicator.register()
|
||||||
|
|
||||||
|
treeService := newTreeServiceMetrics()
|
||||||
|
treeService.register()
|
||||||
|
|
||||||
epoch := newGauge(prometheus.GaugeOpts{
|
epoch := newGauge(prometheus.GaugeOpts{
|
||||||
Namespace: namespace,
|
Namespace: namespace,
|
||||||
Subsystem: innerRingSubsystem,
|
Subsystem: innerRingSubsystem,
|
||||||
|
@ -43,6 +50,7 @@ func NewNodeMetrics() *NodeMetrics {
|
||||||
engineMetrics: engine,
|
engineMetrics: engine,
|
||||||
stateMetrics: state,
|
stateMetrics: state,
|
||||||
replicatorMetrics: replicator,
|
replicatorMetrics: replicator,
|
||||||
|
treeService: treeService,
|
||||||
epoch: epoch,
|
epoch: epoch,
|
||||||
writeCacheMetrics: writeCacheMetrics,
|
writeCacheMetrics: writeCacheMetrics,
|
||||||
}
|
}
|
||||||
|
@ -60,3 +68,7 @@ func (m *NodeMetrics) WriteCache() WriteCacheMetrics {
|
||||||
}
|
}
|
||||||
return m.writeCacheMetrics
|
return m.writeCacheMetrics
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (m *NodeMetrics) TreeService() tree.MetricsRegister {
|
||||||
|
return m.treeService
|
||||||
|
}
|
||||||
|
|
64
pkg/metrics/treeservice.go
Normal file
64
pkg/metrics/treeservice.go
Normal file
|
@ -0,0 +1,64 @@
|
||||||
|
package metrics
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
|
)
|
||||||
|
|
||||||
|
const treeServiceLabelSuccess = "success"
|
||||||
|
|
||||||
|
type treeServiceMetrics struct {
|
||||||
|
replicateTaskDuration metric[*prometheus.HistogramVec]
|
||||||
|
replicateWaitDuration metric[*prometheus.HistogramVec]
|
||||||
|
syncOpDuration metric[*prometheus.HistogramVec]
|
||||||
|
}
|
||||||
|
|
||||||
|
func newTreeServiceMetrics() *treeServiceMetrics {
|
||||||
|
const treeServiceSubsystem = "treeservice"
|
||||||
|
return &treeServiceMetrics{
|
||||||
|
replicateTaskDuration: newHistogramVec(prometheus.HistogramOpts{
|
||||||
|
Namespace: namespace,
|
||||||
|
Subsystem: treeServiceSubsystem,
|
||||||
|
Name: "replicate_task_duration_seconds",
|
||||||
|
Help: "Duration of individual replication tasks executed as part of replication loops",
|
||||||
|
}, []string{treeServiceLabelSuccess}),
|
||||||
|
replicateWaitDuration: newHistogramVec(prometheus.HistogramOpts{
|
||||||
|
Namespace: namespace,
|
||||||
|
Subsystem: treeServiceSubsystem,
|
||||||
|
Name: "replicate_wait_duration_seconds",
|
||||||
|
Help: "Duration of overall waiting time for replication loops",
|
||||||
|
}, []string{treeServiceLabelSuccess}),
|
||||||
|
syncOpDuration: newHistogramVec(prometheus.HistogramOpts{
|
||||||
|
Namespace: namespace,
|
||||||
|
Subsystem: treeServiceSubsystem,
|
||||||
|
Name: "sync_duration_seconds",
|
||||||
|
Help: "Duration of synchronization operations",
|
||||||
|
}, []string{treeServiceLabelSuccess}),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *treeServiceMetrics) register() {
|
||||||
|
mustRegister(m.replicateTaskDuration)
|
||||||
|
mustRegister(m.replicateWaitDuration)
|
||||||
|
mustRegister(m.syncOpDuration)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *treeServiceMetrics) AddReplicateTaskDuration(d time.Duration, success bool) {
|
||||||
|
m.replicateTaskDuration.value.With(prometheus.Labels{
|
||||||
|
treeServiceLabelSuccess: fmt.Sprintf("%v", success),
|
||||||
|
}).Observe(d.Seconds())
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *treeServiceMetrics) AddReplicateWaitDuration(d time.Duration, success bool) {
|
||||||
|
m.replicateWaitDuration.value.With(prometheus.Labels{
|
||||||
|
treeServiceLabelSuccess: fmt.Sprintf("%v", success),
|
||||||
|
}).Observe(d.Seconds())
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *treeServiceMetrics) AddSyncDuration(d time.Duration, success bool) {
|
||||||
|
m.syncOpDuration.value.With(prometheus.Labels{
|
||||||
|
treeServiceLabelSuccess: fmt.Sprintf("%v", success),
|
||||||
|
}).Observe(d.Seconds())
|
||||||
|
}
|
15
pkg/services/tree/metrics.go
Normal file
15
pkg/services/tree/metrics.go
Normal file
|
@ -0,0 +1,15 @@
|
||||||
|
package tree
|
||||||
|
|
||||||
|
import "time"
|
||||||
|
|
||||||
|
type MetricsRegister interface {
|
||||||
|
AddReplicateTaskDuration(time.Duration, bool)
|
||||||
|
AddReplicateWaitDuration(time.Duration, bool)
|
||||||
|
AddSyncDuration(time.Duration, bool)
|
||||||
|
}
|
||||||
|
|
||||||
|
type defaultMetricsRegister struct{}
|
||||||
|
|
||||||
|
func (defaultMetricsRegister) AddReplicateTaskDuration(time.Duration, bool) {}
|
||||||
|
func (defaultMetricsRegister) AddReplicateWaitDuration(time.Duration, bool) {}
|
||||||
|
func (defaultMetricsRegister) AddSyncDuration(time.Duration, bool) {}
|
|
@ -33,6 +33,8 @@ type cfg struct {
|
||||||
replicatorWorkerCount int
|
replicatorWorkerCount int
|
||||||
replicatorTimeout time.Duration
|
replicatorTimeout time.Duration
|
||||||
containerCacheSize int
|
containerCacheSize int
|
||||||
|
|
||||||
|
metrics MetricsRegister
|
||||||
}
|
}
|
||||||
|
|
||||||
// Option represents configuration option for a tree service.
|
// Option represents configuration option for a tree service.
|
||||||
|
@ -116,3 +118,9 @@ func WithReplicationTimeout(t time.Duration) Option {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func WithMetrics(v MetricsRegister) Option {
|
||||||
|
return func(c *cfg) {
|
||||||
|
c.metrics = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -75,6 +75,7 @@ func (s *Service) replicationWorker(ctx context.Context) {
|
||||||
attribute.String("public_key", hex.EncodeToString(task.n.PublicKey())),
|
attribute.String("public_key", hex.EncodeToString(task.n.PublicKey())),
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
start := time.Now()
|
||||||
|
|
||||||
var lastErr error
|
var lastErr error
|
||||||
var lastAddr string
|
var lastAddr string
|
||||||
|
@ -113,6 +114,9 @@ func (s *Service) replicationWorker(ctx context.Context) {
|
||||||
zap.String("address", lastAddr),
|
zap.String("address", lastAddr),
|
||||||
zap.String("key", hex.EncodeToString(task.n.PublicKey())))
|
zap.String("key", hex.EncodeToString(task.n.PublicKey())))
|
||||||
}
|
}
|
||||||
|
s.metrics.AddReplicateTaskDuration(time.Since(start), false)
|
||||||
|
} else {
|
||||||
|
s.metrics.AddReplicateTaskDuration(time.Since(start), true)
|
||||||
}
|
}
|
||||||
span.End()
|
span.End()
|
||||||
}
|
}
|
||||||
|
@ -137,6 +141,7 @@ func (s *Service) replicateLoop(ctx context.Context) {
|
||||||
case <-ctx.Done():
|
case <-ctx.Done():
|
||||||
return
|
return
|
||||||
case op := <-s.replicateCh:
|
case op := <-s.replicateCh:
|
||||||
|
start := time.Now()
|
||||||
err := s.replicate(op)
|
err := s.replicate(op)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
s.log.Error(logs.TreeErrorDuringReplication,
|
s.log.Error(logs.TreeErrorDuringReplication,
|
||||||
|
@ -144,6 +149,7 @@ func (s *Service) replicateLoop(ctx context.Context) {
|
||||||
zap.Stringer("cid", op.cid),
|
zap.Stringer("cid", op.cid),
|
||||||
zap.String("treeID", op.treeID))
|
zap.String("treeID", op.treeID))
|
||||||
}
|
}
|
||||||
|
s.metrics.AddReplicateWaitDuration(time.Since(start), err == nil)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -46,6 +46,7 @@ func New(opts ...Option) *Service {
|
||||||
s.replicatorChannelCapacity = defaultReplicatorCapacity
|
s.replicatorChannelCapacity = defaultReplicatorCapacity
|
||||||
s.replicatorWorkerCount = defaultReplicatorWorkerCount
|
s.replicatorWorkerCount = defaultReplicatorWorkerCount
|
||||||
s.replicatorTimeout = defaultReplicatorSendTimeout
|
s.replicatorTimeout = defaultReplicatorSendTimeout
|
||||||
|
s.metrics = defaultMetricsRegister{}
|
||||||
|
|
||||||
for i := range opts {
|
for i := range opts {
|
||||||
opts[i](&s.cfg)
|
opts[i](&s.cfg)
|
||||||
|
|
|
@ -9,6 +9,7 @@ import (
|
||||||
"math"
|
"math"
|
||||||
"math/rand"
|
"math/rand"
|
||||||
"sync"
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
"git.frostfs.info/TrueCloudLab/frostfs-api-go/v2/pkg/tracing"
|
"git.frostfs.info/TrueCloudLab/frostfs-api-go/v2/pkg/tracing"
|
||||||
"git.frostfs.info/TrueCloudLab/frostfs-node/internal/logs"
|
"git.frostfs.info/TrueCloudLab/frostfs-node/internal/logs"
|
||||||
|
@ -376,9 +377,12 @@ func (s *Service) syncLoop(ctx context.Context) {
|
||||||
ctx, span := tracing.StartSpanFromContext(ctx, "TreeService.sync")
|
ctx, span := tracing.StartSpanFromContext(ctx, "TreeService.sync")
|
||||||
s.log.Debug(logs.TreeSyncingTrees)
|
s.log.Debug(logs.TreeSyncingTrees)
|
||||||
|
|
||||||
|
start := time.Now()
|
||||||
|
|
||||||
cnrs, err := s.cfg.cnrSource.List()
|
cnrs, err := s.cfg.cnrSource.List()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
s.log.Error(logs.TreeCouldNotFetchContainers, zap.Error(err))
|
s.log.Error(logs.TreeCouldNotFetchContainers, zap.Error(err))
|
||||||
|
s.metrics.AddSyncDuration(time.Since(start), false)
|
||||||
span.End()
|
span.End()
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
@ -390,6 +394,8 @@ func (s *Service) syncLoop(ctx context.Context) {
|
||||||
s.removeContainers(ctx, newMap)
|
s.removeContainers(ctx, newMap)
|
||||||
|
|
||||||
s.log.Debug(logs.TreeTreesHaveBeenSynchronized)
|
s.log.Debug(logs.TreeTreesHaveBeenSynchronized)
|
||||||
|
|
||||||
|
s.metrics.AddSyncDuration(time.Since(start), true)
|
||||||
span.End()
|
span.End()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue