[#370] Add tree service metrics
All checks were successful
ci/woodpecker/pr/pre-commit Pipeline was successful
Signed-off-by: Alejandro Lopez <a.lopez@yadro.com>
This commit is contained in:
parent
a1823423cf
commit
bfded3a547
9 changed files with 117 additions and 3 deletions
|
@ -10,6 +10,7 @@ Changelog for FrostFS Node
|
|||
- Support copies number parameter in `frostfs-cli object put` (#351)
|
||||
- Set extra wallets on SIGHUP for ir (#125)
|
||||
- Writecache metrics (#312)
|
||||
- Add tree service metrics (#370)
|
||||
|
||||
### Changed
|
||||
- `frostfs-cli util locode generate` is now much faster (#309)
|
||||
|
|
|
@ -55,7 +55,8 @@ func initTreeService(c *cfg) {
|
|||
tree.WithContainerCacheSize(treeConfig.CacheSize()),
|
||||
tree.WithReplicationTimeout(treeConfig.ReplicationTimeout()),
|
||||
tree.WithReplicationChannelCapacity(treeConfig.ReplicationChannelCapacity()),
|
||||
tree.WithReplicationWorkerCount(treeConfig.ReplicationWorkerCount()))
|
||||
tree.WithReplicationWorkerCount(treeConfig.ReplicationWorkerCount()),
|
||||
tree.WithMetrics(c.metricsCollector.TreeService()))
|
||||
|
||||
for _, srv := range c.cfgGRPC.servers {
|
||||
tree.RegisterTreeServiceServer(srv, c.treeService)
|
||||
|
|
|
@ -1,6 +1,9 @@
|
|||
package metrics
|
||||
|
||||
import "github.com/prometheus/client_golang/prometheus"
|
||||
import (
|
||||
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/services/tree"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
)
|
||||
|
||||
const namespace = "frostfs_node"
|
||||
|
||||
|
@ -9,9 +12,10 @@ type NodeMetrics struct {
|
|||
engineMetrics
|
||||
stateMetrics
|
||||
replicatorMetrics
|
||||
epoch metric[prometheus.Gauge]
|
||||
|
||||
writeCacheMetrics *writeCacheMetrics
|
||||
treeService *treeServiceMetrics
|
||||
epoch metric[prometheus.Gauge]
|
||||
}
|
||||
|
||||
func NewNodeMetrics() *NodeMetrics {
|
||||
|
@ -27,6 +31,9 @@ func NewNodeMetrics() *NodeMetrics {
|
|||
replicator := newReplicatorMetrics()
|
||||
replicator.register()
|
||||
|
||||
treeService := newTreeServiceMetrics()
|
||||
treeService.register()
|
||||
|
||||
epoch := newGauge(prometheus.GaugeOpts{
|
||||
Namespace: namespace,
|
||||
Subsystem: innerRingSubsystem,
|
||||
|
@ -43,6 +50,7 @@ func NewNodeMetrics() *NodeMetrics {
|
|||
engineMetrics: engine,
|
||||
stateMetrics: state,
|
||||
replicatorMetrics: replicator,
|
||||
treeService: treeService,
|
||||
epoch: epoch,
|
||||
writeCacheMetrics: writeCacheMetrics,
|
||||
}
|
||||
|
@ -60,3 +68,7 @@ func (m *NodeMetrics) WriteCache() WriteCacheMetrics {
|
|||
}
|
||||
return m.writeCacheMetrics
|
||||
}
|
||||
|
||||
func (m *NodeMetrics) TreeService() tree.MetricsRegister {
|
||||
return m.treeService
|
||||
}
|
||||
|
|
64
pkg/metrics/treeservice.go
Normal file
64
pkg/metrics/treeservice.go
Normal file
|
@ -0,0 +1,64 @@
|
|||
package metrics
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
)
|
||||
|
||||
const treeServiceLabelSuccess = "success"
|
||||
|
||||
// treeServiceMetrics groups the Prometheus histograms exposed by the tree
// service. Every histogram is partitioned by a single "success" label
// (treeServiceLabelSuccess) indicating whether the measured operation
// completed without error.
type treeServiceMetrics struct {
	replicateTaskDuration metric[*prometheus.HistogramVec] // duration of individual replication tasks
	replicateWaitDuration metric[*prometheus.HistogramVec] // overall time spent per replication-loop iteration
	syncOpDuration        metric[*prometheus.HistogramVec] // duration of tree synchronization operations
}
|
||||
|
||||
func newTreeServiceMetrics() *treeServiceMetrics {
|
||||
const treeServiceSubsystem = "treeservice"
|
||||
return &treeServiceMetrics{
|
||||
replicateTaskDuration: newHistogramVec(prometheus.HistogramOpts{
|
||||
Namespace: namespace,
|
||||
Subsystem: treeServiceSubsystem,
|
||||
Name: "replicate_task_duration_seconds",
|
||||
Help: "Duration of individual replication tasks executed as part of replication loops",
|
||||
}, []string{treeServiceLabelSuccess}),
|
||||
replicateWaitDuration: newHistogramVec(prometheus.HistogramOpts{
|
||||
Namespace: namespace,
|
||||
Subsystem: treeServiceSubsystem,
|
||||
Name: "replicate_wait_duration_seconds",
|
||||
Help: "Duration of overall waiting time for replication loops",
|
||||
}, []string{treeServiceLabelSuccess}),
|
||||
syncOpDuration: newHistogramVec(prometheus.HistogramOpts{
|
||||
Namespace: namespace,
|
||||
Subsystem: treeServiceSubsystem,
|
||||
Name: "sync_duration_seconds",
|
||||
Help: "Duration of synchronization operations",
|
||||
}, []string{treeServiceLabelSuccess}),
|
||||
}
|
||||
}
|
||||
|
||||
// register adds all tree service histograms to the Prometheus registry.
// mustRegister presumably panics on duplicate registration (Go "Must"
// convention) — confirm against the project helper's definition.
func (m *treeServiceMetrics) register() {
	mustRegister(m.replicateTaskDuration)
	mustRegister(m.replicateWaitDuration)
	mustRegister(m.syncOpDuration)
}
|
||||
|
||||
func (m *treeServiceMetrics) AddReplicateTaskDuration(d time.Duration, success bool) {
|
||||
m.replicateTaskDuration.value.With(prometheus.Labels{
|
||||
treeServiceLabelSuccess: fmt.Sprintf("%v", success),
|
||||
}).Observe(d.Seconds())
|
||||
}
|
||||
|
||||
func (m *treeServiceMetrics) AddReplicateWaitDuration(d time.Duration, success bool) {
|
||||
m.replicateWaitDuration.value.With(prometheus.Labels{
|
||||
treeServiceLabelSuccess: fmt.Sprintf("%v", success),
|
||||
}).Observe(d.Seconds())
|
||||
}
|
||||
|
||||
func (m *treeServiceMetrics) AddSyncDuration(d time.Duration, success bool) {
|
||||
m.syncOpDuration.value.With(prometheus.Labels{
|
||||
treeServiceLabelSuccess: fmt.Sprintf("%v", success),
|
||||
}).Observe(d.Seconds())
|
||||
}
|
15
pkg/services/tree/metrics.go
Normal file
15
pkg/services/tree/metrics.go
Normal file
|
@ -0,0 +1,15 @@
|
|||
package tree
|
||||
|
||||
import "time"
|
||||
|
||||
// MetricsRegister is the set of callbacks the tree service uses to report
// durations of its replication and synchronization operations. The boolean
// argument reports whether the measured operation succeeded.
type MetricsRegister interface {
	// AddReplicateTaskDuration records the duration of a single
	// replication task.
	AddReplicateTaskDuration(time.Duration, bool)
	// AddReplicateWaitDuration records the overall time one
	// replication-loop iteration took.
	AddReplicateWaitDuration(time.Duration, bool)
	// AddSyncDuration records the duration of a tree synchronization
	// operation.
	AddSyncDuration(time.Duration, bool)
}

// defaultMetricsRegister is the no-op implementation used when the service
// is constructed without the WithMetrics option.
type defaultMetricsRegister struct{}

// Compile-time check that the default implementation satisfies the interface.
var _ MetricsRegister = defaultMetricsRegister{}

func (defaultMetricsRegister) AddReplicateTaskDuration(time.Duration, bool) {}
func (defaultMetricsRegister) AddReplicateWaitDuration(time.Duration, bool) {}
func (defaultMetricsRegister) AddSyncDuration(time.Duration, bool)          {}
|
|
@ -33,6 +33,8 @@ type cfg struct {
|
|||
replicatorWorkerCount int
|
||||
replicatorTimeout time.Duration
|
||||
containerCacheSize int
|
||||
|
||||
metrics MetricsRegister
|
||||
}
|
||||
|
||||
// Option represents configuration option for a tree service.
|
||||
|
@ -116,3 +118,9 @@ func WithReplicationTimeout(t time.Duration) Option {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
func WithMetrics(v MetricsRegister) Option {
|
||||
return func(c *cfg) {
|
||||
c.metrics = v
|
||||
}
|
||||
}
|
||||
|
|
|
@ -75,6 +75,7 @@ func (s *Service) replicationWorker(ctx context.Context) {
|
|||
attribute.String("public_key", hex.EncodeToString(task.n.PublicKey())),
|
||||
),
|
||||
)
|
||||
start := time.Now()
|
||||
|
||||
var lastErr error
|
||||
var lastAddr string
|
||||
|
@ -113,6 +114,9 @@ func (s *Service) replicationWorker(ctx context.Context) {
|
|||
zap.String("address", lastAddr),
|
||||
zap.String("key", hex.EncodeToString(task.n.PublicKey())))
|
||||
}
|
||||
s.metrics.AddReplicateTaskDuration(time.Since(start), false)
|
||||
} else {
|
||||
s.metrics.AddReplicateTaskDuration(time.Since(start), true)
|
||||
}
|
||||
span.End()
|
||||
}
|
||||
|
@ -137,6 +141,7 @@ func (s *Service) replicateLoop(ctx context.Context) {
|
|||
case <-ctx.Done():
|
||||
return
|
||||
case op := <-s.replicateCh:
|
||||
start := time.Now()
|
||||
err := s.replicate(op)
|
||||
if err != nil {
|
||||
s.log.Error(logs.TreeErrorDuringReplication,
|
||||
|
@ -144,6 +149,7 @@ func (s *Service) replicateLoop(ctx context.Context) {
|
|||
zap.Stringer("cid", op.cid),
|
||||
zap.String("treeID", op.treeID))
|
||||
}
|
||||
s.metrics.AddReplicateWaitDuration(time.Since(start), err == nil)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -46,6 +46,7 @@ func New(opts ...Option) *Service {
|
|||
s.replicatorChannelCapacity = defaultReplicatorCapacity
|
||||
s.replicatorWorkerCount = defaultReplicatorWorkerCount
|
||||
s.replicatorTimeout = defaultReplicatorSendTimeout
|
||||
s.metrics = defaultMetricsRegister{}
|
||||
|
||||
for i := range opts {
|
||||
opts[i](&s.cfg)
|
||||
|
|
|
@ -9,6 +9,7 @@ import (
|
|||
"math"
|
||||
"math/rand"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"git.frostfs.info/TrueCloudLab/frostfs-api-go/v2/pkg/tracing"
|
||||
"git.frostfs.info/TrueCloudLab/frostfs-node/internal/logs"
|
||||
|
@ -376,9 +377,12 @@ func (s *Service) syncLoop(ctx context.Context) {
|
|||
ctx, span := tracing.StartSpanFromContext(ctx, "TreeService.sync")
|
||||
s.log.Debug(logs.TreeSyncingTrees)
|
||||
|
||||
start := time.Now()
|
||||
|
||||
cnrs, err := s.cfg.cnrSource.List()
|
||||
if err != nil {
|
||||
s.log.Error(logs.TreeCouldNotFetchContainers, zap.Error(err))
|
||||
s.metrics.AddSyncDuration(time.Since(start), false)
|
||||
span.End()
|
||||
continue
|
||||
}
|
||||
|
@ -390,6 +394,8 @@ func (s *Service) syncLoop(ctx context.Context) {
|
|||
s.removeContainers(ctx, newMap)
|
||||
|
||||
s.log.Debug(logs.TreeTreesHaveBeenSynchronized)
|
||||
|
||||
s.metrics.AddSyncDuration(time.Since(start), true)
|
||||
span.End()
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue