[#370] Add tree service metrics #388
|
@ -10,6 +10,7 @@ Changelog for FrostFS Node
|
||||||
- Support copies number parameter in `frostfs-cli object put` (#351)
|
- Support copies number parameter in `frostfs-cli object put` (#351)
|
||||||
- Set extra wallets on SIGHUP for ir (#125)
|
- Set extra wallets on SIGHUP for ir (#125)
|
||||||
- Writecache metrics (#312)
|
- Writecache metrics (#312)
|
||||||
|
- Add tree service metrics (#370)
|
||||||
|
|
||||||
### Changed
|
### Changed
|
||||||
- `frostfs-cli util locode generate` is now much faster (#309)
|
- `frostfs-cli util locode generate` is now much faster (#309)
|
||||||
|
|
|
@ -55,7 +55,8 @@ func initTreeService(c *cfg) {
|
||||||
tree.WithContainerCacheSize(treeConfig.CacheSize()),
|
tree.WithContainerCacheSize(treeConfig.CacheSize()),
|
||||||
tree.WithReplicationTimeout(treeConfig.ReplicationTimeout()),
|
tree.WithReplicationTimeout(treeConfig.ReplicationTimeout()),
|
||||||
tree.WithReplicationChannelCapacity(treeConfig.ReplicationChannelCapacity()),
|
tree.WithReplicationChannelCapacity(treeConfig.ReplicationChannelCapacity()),
|
||||||
tree.WithReplicationWorkerCount(treeConfig.ReplicationWorkerCount()))
|
tree.WithReplicationWorkerCount(treeConfig.ReplicationWorkerCount()),
|
||||||
|
tree.WithMetrics(c.metricsCollector.TreeService()))
|
||||||
|
|
||||||
for _, srv := range c.cfgGRPC.servers {
|
for _, srv := range c.cfgGRPC.servers {
|
||||||
tree.RegisterTreeServiceServer(srv, c.treeService)
|
tree.RegisterTreeServiceServer(srv, c.treeService)
|
||||||
|
|
|
@ -1,6 +1,9 @@
|
||||||
package metrics
|
package metrics
|
||||||
|
|
||||||
import "github.com/prometheus/client_golang/prometheus"
|
import (
|
||||||
|
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/services/tree"
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
|
)
|
||||||
|
|
||||||
const namespace = "frostfs_node"
|
const namespace = "frostfs_node"
|
||||||
|
|
||||||
|
@ -9,9 +12,10 @@ type NodeMetrics struct {
|
||||||
engineMetrics
|
engineMetrics
|
||||||
stateMetrics
|
stateMetrics
|
||||||
replicatorMetrics
|
replicatorMetrics
|
||||||
epoch metric[prometheus.Gauge]
|
|
||||||
|
|||||||
|
|
||||||
writeCacheMetrics *writeCacheMetrics
|
writeCacheMetrics *writeCacheMetrics
|
||||||
|
treeService *treeServiceMetrics
|
||||||
|
epoch metric[prometheus.Gauge]
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewNodeMetrics() *NodeMetrics {
|
func NewNodeMetrics() *NodeMetrics {
|
||||||
|
@ -27,6 +31,9 @@ func NewNodeMetrics() *NodeMetrics {
|
||||||
replicator := newReplicatorMetrics()
|
replicator := newReplicatorMetrics()
|
||||||
replicator.register()
|
replicator.register()
|
||||||
|
|
||||||
|
treeService := newTreeServiceMetrics()
|
||||||
|
treeService.register()
|
||||||
|
|
||||||
epoch := newGauge(prometheus.GaugeOpts{
|
epoch := newGauge(prometheus.GaugeOpts{
|
||||||
Namespace: namespace,
|
Namespace: namespace,
|
||||||
Subsystem: innerRingSubsystem,
|
Subsystem: innerRingSubsystem,
|
||||||
|
@ -43,6 +50,7 @@ func NewNodeMetrics() *NodeMetrics {
|
||||||
engineMetrics: engine,
|
engineMetrics: engine,
|
||||||
stateMetrics: state,
|
stateMetrics: state,
|
||||||
replicatorMetrics: replicator,
|
replicatorMetrics: replicator,
|
||||||
|
treeService: treeService,
|
||||||
epoch: epoch,
|
epoch: epoch,
|
||||||
writeCacheMetrics: writeCacheMetrics,
|
writeCacheMetrics: writeCacheMetrics,
|
||||||
}
|
}
|
||||||
|
@ -60,3 +68,7 @@ func (m *NodeMetrics) WriteCache() WriteCacheMetrics {
|
||||||
}
|
}
|
||||||
return m.writeCacheMetrics
|
return m.writeCacheMetrics
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (m *NodeMetrics) TreeService() tree.MetricsRegister {
|
||||||
|
return m.treeService
|
||||||
|
}
|
||||||
|
|
64
pkg/metrics/treeservice.go
Normal file
|
@ -0,0 +1,64 @@
|
||||||
|
package metrics
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
|
)
|
||||||
|
|
||||||
|
const treeServiceLabelSuccess = "success"
|
||||||
|
|
||||||
|
type treeServiceMetrics struct {
|
||||||
|
replicateTaskDuration metric[*prometheus.HistogramVec]
|
||||||
|
replicateWaitDuration metric[*prometheus.HistogramVec]
|
||||||
|
syncOpDuration metric[*prometheus.HistogramVec]
|
||||||
|
}
|
||||||
|
|
||||||
|
func newTreeServiceMetrics() *treeServiceMetrics {
|
||||||
|
const treeServiceSubsystem = "treeservice"
|
||||||
|
return &treeServiceMetrics{
|
||||||
|
replicateTaskDuration: newHistogramVec(prometheus.HistogramOpts{
|
||||||
|
Namespace: namespace,
|
||||||
|
Subsystem: treeServiceSubsystem,
|
||||||
|
Name: "replicate_task_duration_seconds",
|
||||||
|
Help: "Duration of individual replication tasks executed as part of replication loops",
|
||||||
|
}, []string{treeServiceLabelSuccess}),
|
||||||
|
replicateWaitDuration: newHistogramVec(prometheus.HistogramOpts{
|
||||||
|
Namespace: namespace,
|
||||||
|
Subsystem: treeServiceSubsystem,
|
||||||
|
Name: "replicate_wait_duration_seconds",
|
||||||
|
Help: "Duration of overall waiting time for replication loops",
|
||||||
|
}, []string{treeServiceLabelSuccess}),
|
||||||
|
syncOpDuration: newHistogramVec(prometheus.HistogramOpts{
|
||||||
|
Namespace: namespace,
|
||||||
|
Subsystem: treeServiceSubsystem,
|
||||||
|
Name: "sync_duration_seconds",
|
||||||
|
Help: "Duration of synchronization operations",
|
||||||
|
}, []string{treeServiceLabelSuccess}),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *treeServiceMetrics) register() {
|
||||||
|
mustRegister(m.replicateTaskDuration)
|
||||||
|
mustRegister(m.replicateWaitDuration)
|
||||||
|
mustRegister(m.syncOpDuration)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *treeServiceMetrics) AddReplicateTaskDuration(d time.Duration, success bool) {
|
||||||
|
m.replicateTaskDuration.value.With(prometheus.Labels{
|
||||||
dstepanov-yadro marked this conversation as resolved
Outdated
dstepanov-yadro
commented
@ale64bit @fyrchik Maybe it's not worth separating operations and errors, but using labels everywhere? In fact, there is an operation that ended with an error or without an error. @ale64bit @fyrchik Maybe it's not worth separating operations and errors, but using labels everywhere? In fact, there is an operation that ended with an error or without an error.
ale64bit
commented
I didn't fully understand what you meant by "In fact, there is an operation that ended with an error or without an error."? I didn't fully understand what you meant by *"In fact, there is an operation that ended with an error or without an error."*?
dstepanov-yadro
commented
I'm sorry, I didn't make myself clear enough.
Operations and errors are counted separately. I think it should be a single counter with label separation. I'm sorry, I didn't make myself clear enough.
This code:
```
case <-s.syncChan:
s.metrics.IncSyncOps()
cnrs, err := s.cfg.cnrSource.List()
if err != nil {
s.metrics.IncSyncErrors()
return
}
```
Operations and errors are counted separately. I think it should be a single counter with label separation.
If we need to display the number of success operations, then we should use such expression: `irate(sync_operation[1m]) - irate(sync_operation_errors[1m])` instead of `irate(sync_operation{success="true"}[1m])`
ale64bit
commented
You mean that we should just use a single You mean that we should just use a single `HistogramVec` for operations in general? (with a `success` label). Since it includes a count of the observations as well.
dstepanov-yadro
commented
yes yes
ale64bit
commented
I agree it's better. Done. I agree it's better. Done.
|
|||||||
|
treeServiceLabelSuccess: fmt.Sprintf("%v", success),
|
||||||
|
}).Observe(d.Seconds())
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *treeServiceMetrics) AddReplicateWaitDuration(d time.Duration, success bool) {
|
||||||
|
m.replicateWaitDuration.value.With(prometheus.Labels{
|
||||||
|
treeServiceLabelSuccess: fmt.Sprintf("%v", success),
|
||||||
|
}).Observe(d.Seconds())
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *treeServiceMetrics) AddSyncDuration(d time.Duration, success bool) {
|
||||||
|
m.syncOpDuration.value.With(prometheus.Labels{
|
||||||
|
treeServiceLabelSuccess: fmt.Sprintf("%v", success),
|
||||||
|
}).Observe(d.Seconds())
|
||||||
|
}
|
15
pkg/services/tree/metrics.go
Normal file
|
@ -0,0 +1,15 @@
|
||||||
|
package tree
|
||||||
|
|
||||||
|
import "time"
|
||||||
|
|
||||||
|
type MetricsRegister interface {
|
||||||
|
AddReplicateTaskDuration(time.Duration, bool)
|
||||||
|
AddReplicateWaitDuration(time.Duration, bool)
|
||||||
|
AddSyncDuration(time.Duration, bool)
|
||||||
|
}
|
||||||
|
|
||||||
|
type defaultMetricsRegister struct{}
|
||||||
|
|
||||||
|
func (defaultMetricsRegister) AddReplicateTaskDuration(time.Duration, bool) {}
|
||||||
|
func (defaultMetricsRegister) AddReplicateWaitDuration(time.Duration, bool) {}
|
||||||
|
func (defaultMetricsRegister) AddSyncDuration(time.Duration, bool) {}
|
|
@ -33,6 +33,8 @@ type cfg struct {
|
||||||
replicatorWorkerCount int
|
replicatorWorkerCount int
|
||||||
replicatorTimeout time.Duration
|
replicatorTimeout time.Duration
|
||||||
containerCacheSize int
|
containerCacheSize int
|
||||||
|
|
||||||
|
metrics MetricsRegister
|
||||||
}
|
}
|
||||||
|
|
||||||
// Option represents configuration option for a tree service.
|
// Option represents configuration option for a tree service.
|
||||||
|
@ -116,3 +118,9 @@ func WithReplicationTimeout(t time.Duration) Option {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func WithMetrics(v MetricsRegister) Option {
|
||||||
|
return func(c *cfg) {
|
||||||
|
c.metrics = v
|
||||||
|
}
|
||||||
fyrchik marked this conversation as resolved
Outdated
fyrchik
commented
What about sane dummy non-null default? What about sane dummy non-null default?
ale64bit
commented
done done
|
|||||||
|
}
|
||||||
|
|
|
@ -75,6 +75,7 @@ func (s *Service) replicationWorker(ctx context.Context) {
|
||||||
attribute.String("public_key", hex.EncodeToString(task.n.PublicKey())),
|
attribute.String("public_key", hex.EncodeToString(task.n.PublicKey())),
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
start := time.Now()
|
||||||
|
|
||||||
var lastErr error
|
var lastErr error
|
||||||
var lastAddr string
|
var lastAddr string
|
||||||
|
@ -113,6 +114,9 @@ func (s *Service) replicationWorker(ctx context.Context) {
|
||||||
zap.String("address", lastAddr),
|
zap.String("address", lastAddr),
|
||||||
zap.String("key", hex.EncodeToString(task.n.PublicKey())))
|
zap.String("key", hex.EncodeToString(task.n.PublicKey())))
|
||||||
}
|
}
|
||||||
|
s.metrics.AddReplicateTaskDuration(time.Since(start), false)
|
||||||
|
} else {
|
||||||
|
s.metrics.AddReplicateTaskDuration(time.Since(start), true)
|
||||||
}
|
}
|
||||||
span.End()
|
span.End()
|
||||||
}
|
}
|
||||||
|
@ -137,6 +141,7 @@ func (s *Service) replicateLoop(ctx context.Context) {
|
||||||
case <-ctx.Done():
|
case <-ctx.Done():
|
||||||
return
|
return
|
||||||
case op := <-s.replicateCh:
|
case op := <-s.replicateCh:
|
||||||
|
start := time.Now()
|
||||||
err := s.replicate(op)
|
err := s.replicate(op)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
s.log.Error(logs.TreeErrorDuringReplication,
|
s.log.Error(logs.TreeErrorDuringReplication,
|
||||||
|
@ -144,6 +149,7 @@ func (s *Service) replicateLoop(ctx context.Context) {
|
||||||
zap.Stringer("cid", op.cid),
|
zap.Stringer("cid", op.cid),
|
||||||
zap.String("treeID", op.treeID))
|
zap.String("treeID", op.treeID))
|
||||||
fyrchik marked this conversation as resolved
Outdated
fyrchik
commented
Just to be aware: `replicate` does 3 things:
1. Signs the operation.
2. Calculates the list of nodes to replicate to.
3. Sends the request.
While this metric _is_ useful (how much time do we wait in queue), can we also measure the time spent in `replicationWorker`?
ale64bit
commented
done
|
|||||||
}
|
}
|
||||||
fyrchik marked this conversation as resolved
Outdated
fyrchik
commented
Can we unify 3 lines with `s.metrics.AddReplicateWaitDuration(time.Since(start), err == nil)`?
ale64bit
commented
done
|
|||||||
|
s.metrics.AddReplicateWaitDuration(time.Since(start), err == nil)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -46,6 +46,7 @@ func New(opts ...Option) *Service {
|
||||||
s.replicatorChannelCapacity = defaultReplicatorCapacity
|
s.replicatorChannelCapacity = defaultReplicatorCapacity
|
||||||
s.replicatorWorkerCount = defaultReplicatorWorkerCount
|
s.replicatorWorkerCount = defaultReplicatorWorkerCount
|
||||||
s.replicatorTimeout = defaultReplicatorSendTimeout
|
s.replicatorTimeout = defaultReplicatorSendTimeout
|
||||||
|
s.metrics = defaultMetricsRegister{}
|
||||||
|
|
||||||
for i := range opts {
|
for i := range opts {
|
||||||
opts[i](&s.cfg)
|
opts[i](&s.cfg)
|
||||||
|
|
|
@ -9,6 +9,7 @@ import (
|
||||||
"math"
|
"math"
|
||||||
"math/rand"
|
"math/rand"
|
||||||
"sync"
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
"git.frostfs.info/TrueCloudLab/frostfs-api-go/v2/pkg/tracing"
|
"git.frostfs.info/TrueCloudLab/frostfs-api-go/v2/pkg/tracing"
|
||||||
"git.frostfs.info/TrueCloudLab/frostfs-node/internal/logs"
|
"git.frostfs.info/TrueCloudLab/frostfs-node/internal/logs"
|
||||||
|
@ -376,9 +377,12 @@ func (s *Service) syncLoop(ctx context.Context) {
|
||||||
ctx, span := tracing.StartSpanFromContext(ctx, "TreeService.sync")
|
ctx, span := tracing.StartSpanFromContext(ctx, "TreeService.sync")
|
||||||
s.log.Debug(logs.TreeSyncingTrees)
|
s.log.Debug(logs.TreeSyncingTrees)
|
||||||
fyrchik
commented
ahh
ale64bit
commented
If by "ahh" you mean "I don't like this empty line", I removed it. I wish `gofmt` would eventually have a say on empty lines, to avoid wasting time discussing them.
fyrchik
commented
It's not about an empty line per se, it's about unrelated changes in the commit.
|
|||||||
|
|
||||||
|
start := time.Now()
|
||||||
|
|
||||||
cnrs, err := s.cfg.cnrSource.List()
|
cnrs, err := s.cfg.cnrSource.List()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
s.log.Error(logs.TreeCouldNotFetchContainers, zap.Error(err))
|
s.log.Error(logs.TreeCouldNotFetchContainers, zap.Error(err))
|
||||||
|
s.metrics.AddSyncDuration(time.Since(start), false)
|
||||||
span.End()
|
span.End()
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
@ -390,6 +394,8 @@ func (s *Service) syncLoop(ctx context.Context) {
|
||||||
s.removeContainers(ctx, newMap)
|
s.removeContainers(ctx, newMap)
|
||||||
|
|
||||||
s.log.Debug(logs.TreeTreesHaveBeenSynchronized)
|
s.log.Debug(logs.TreeTreesHaveBeenSynchronized)
|
||||||
|
|
||||||
|
s.metrics.AddSyncDuration(time.Since(start), true)
|
||||||
span.End()
|
span.End()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Why did we move this line?
I moved it to clearly split the embedded fields from the others, including this one.