[#371] ir: Add morph cache metrics #400

Merged
fyrchik merged 1 commit from aarifullin/frostfs-node:feature/371-morph_cache_metr into master 2023-06-13 10:10:16 +00:00
11 changed files with 154 additions and 27 deletions

View file

@ -469,11 +469,12 @@ func (s *Server) initMorph(ctx context.Context, cfg *viper.Viper, errChan chan<-
} }
morphChain := &chainParams{ morphChain := &chainParams{
log: s.log, log: s.log,
cfg: cfg, cfg: cfg,
key: s.key, key: s.key,
name: morphPrefix, name: morphPrefix,
from: fromSideChainBlock, from: fromSideChainBlock,
morphCacheMetric: s.metrics.MorphCacheMetrics(),
} }
// create morph client // create morph client

View file

@ -102,12 +102,13 @@ type (
} }
chainParams struct { chainParams struct {
log *logger.Logger log *logger.Logger
cfg *viper.Viper cfg *viper.Viper
key *keys.PrivateKey key *keys.PrivateKey
name string name string
sgn *transaction.Signer sgn *transaction.Signer
from uint32 // block height from uint32 // block height
morphCacheMetric metrics.MorphCacheMetrics
} }
) )
@ -465,6 +466,7 @@ func createClient(ctx context.Context, p *chainParams, errChan chan<- error) (*c
errChan <- fmt.Errorf("%s chain connection has been lost", p.name) errChan <- fmt.Errorf("%s chain connection has been lost", p.name)
}), }),
client.WithSwitchInterval(p.cfg.GetDuration(p.name+".switch_interval")), client.WithSwitchInterval(p.cfg.GetDuration(p.name+".switch_interval")),
client.WithMorphCacheMetrics(p.morphCacheMetric),
) )
} }

View file

@ -16,9 +16,10 @@ const (
// InnerRingServiceMetrics contains metrics collected by inner ring. // InnerRingServiceMetrics contains metrics collected by inner ring.
type InnerRingServiceMetrics struct { type InnerRingServiceMetrics struct {
epoch prometheus.Gauge epoch prometheus.Gauge
health prometheus.Gauge health prometheus.Gauge
eventDuration *prometheus.HistogramVec eventDuration *prometheus.HistogramVec
morphCacheMetrics *morphCacheMetrics
} }
// NewInnerRingMetrics returns new instance of metrics collectors for inner ring. // NewInnerRingMetrics returns new instance of metrics collectors for inner ring.
@ -45,9 +46,10 @@ func NewInnerRingMetrics() *InnerRingServiceMetrics {
) )
return &InnerRingServiceMetrics{ return &InnerRingServiceMetrics{
epoch: epoch, epoch: epoch,
health: health, health: health,
eventDuration: eventDuration, eventDuration: eventDuration,
morphCacheMetrics: newMorphCacheMetrics(),
} }
} }
@ -67,3 +69,7 @@ func (m InnerRingServiceMetrics) AddEvent(d time.Duration, typ string, success b
innerRingLabelSuccess: strconv.FormatBool(success), innerRingLabelSuccess: strconv.FormatBool(success),
}).Observe(d.Seconds()) }).Observe(d.Seconds())
} }
func (m InnerRingServiceMetrics) MorphCacheMetrics() MorphCacheMetrics {
return m.morphCacheMetrics
}

77
pkg/metrics/morphcache.go Normal file
View file

@ -0,0 +1,77 @@
package metrics
import (
"fmt"
"time"
"git.frostfs.info/TrueCloudLab/frostfs-observability/metrics"
"github.com/nspcc-dev/neo-go/pkg/util"
"github.com/prometheus/client_golang/prometheus"
)
const (
mcSubsystem = "morphcache"
mcSuccess = "success"
)
type MorphCacheMetrics interface {
fyrchik marked this conversation as resolved Outdated

Is it used anywhere?

Is it used anywhere?
AddNNSContractHashDuration(success bool, d time.Duration)
AddGroupKeyDuration(success bool, d time.Duration)
AddTxHeightDuration(hash util.Uint256, success bool, d time.Duration)
}
type morphCacheMetrics struct {
// Duration of processing get nns contract hash request
nnsContractHashDuration *prometheus.HistogramVec
// Duration of processing get group key request
groupKeyDuration *prometheus.HistogramVec
// Duration of processing get tx height request
txHeightDuration *prometheus.HistogramVec
}
dstepanov-yadro marked this conversation as resolved Outdated

According to this #395 - is it necessary for counter here too?

According to this #395 - is it necessary for counter here too?

Yeah, I think so. These will be removed

Yeah, I think so. These will be removed

I checked out @dstepanov-yadro 's PRs and his implementation for writeCache.

You can look at these lines and this writeCache' feature - you can see that cache uses db and fsTree.

I need to separate flags - cache_hit and found, because we can encounter these situations:

  1. cache hit -> object found (true, true)
  2. cache miss but this miss is being processed and object is found (false, true)
  3. cache miss, object has not been found at all - (false, false)

If I use only success flag like this, then we will never know was it cache miss or error during processing cache miss

@dstepanov-yadro , please, fix me, if I am not correct

I checked out @dstepanov-yadro 's PRs and his implementation for `writeCache`. You can look at these [lines](https://git.frostfs.info/TrueCloudLab/frostfs-node/src/branch/master/pkg/local_object_storage/writecache/get.go#L39) and this `writeCache`' feature - you can see that cache uses `db` and `fsTree`. I need to separate flags - `cache_hit` and `found`, because we can encounter these situations: 1. cache hit -> object found (true, true) 2. cache miss but this miss is being processed and object is found (false, true) 3. cache miss, object has not been found at all - (false, false) If I use only `success` flag like [this](https://git.frostfs.info/TrueCloudLab/frostfs-node/src/branch/master/pkg/local_object_storage/writecache/get.go#L39), then we will never know was it cache miss or error during processing cache miss @dstepanov-yadro , please, fix me, if I am not correct

I think than @acid-ant suggests using getNNSContractHashDuration metric[*prometheus.HistogramVec] with two headers: cache_hit and success.

I think than @acid-ant suggests using `getNNSContractHashDuration metric[*prometheus.HistogramVec]` with two headers: cache_hit and success.

with two headers: cache_hit and success

I've introduced cache_hit = true/false and found = true/false. I find the success label inconvenient because it's not verbose from my POV :)

P.S. Replaced it to success anyway

> with two headers: cache_hit and success I've introduced `cache_hit = true/false` and `found = true/false`. I find the `success` label inconvenient because it's not verbose from my POV :) **P.S. Replaced it to `success` anyway**
var _ MorphCacheMetrics = (*morphCacheMetrics)(nil)
func newMorphCacheMetrics() *morphCacheMetrics {
return &morphCacheMetrics{
nnsContractHashDuration: newMCMethodDurationCounter("nns_contract_hash"),
groupKeyDuration: newMCMethodDurationCounter("group_key"),
txHeightDuration: newMCMethodDurationCounter("tx_height"),
}

See https://prometheus.io/docs/practices/naming/

A metric name... ...should have a suffix describing the unit, in plural form. ...
See https://prometheus.io/docs/practices/naming/ ``` A metric name... ...should have a suffix describing the unit, in plural form. ... ```

Oh, sorry... Fixed!

Oh, sorry... Fixed!
}
func (m *morphCacheMetrics) AddNNSContractHashDuration(success bool, d time.Duration) {
m.nnsContractHashDuration.With(
prometheus.Labels{
mcSuccess: fmt.Sprintf("%v", success),
},
).Observe(float64(d))
}
func (m *morphCacheMetrics) AddGroupKeyDuration(success bool, d time.Duration) {
m.groupKeyDuration.With(
prometheus.Labels{
mcSuccess: fmt.Sprintf("%v", success),
},
).Observe(float64(d))
}

Is mcTxHash countable? If not we will get a lot of metric records.

Is mcTxHash countable? If not we will get a lot of metric records.

Alright, probably I misunderstood the meaing of the labels. I've fixed!

Alright, probably I misunderstood the meaing of the labels. I've fixed!
func (m *morphCacheMetrics) AddTxHeightDuration(hash util.Uint256, success bool, d time.Duration) {
m.txHeightDuration.With(
prometheus.Labels{
mcSuccess: fmt.Sprintf("%v", success),
},
).Observe(float64(d))

I was thinking is it worth it to add the nns contract hash label but after all I concluded that it's not helpful information from the point of metric calculation.

The second point - the events like getting NNS contract hash and group key are rare compared to getting tx height but I've added them anyway

I was thinking is it worth it to add the nns contract hash label but after all I concluded that it's not helpful information from the point of metric calculation. The second point - the events like getting NNS contract hash and group key are rare compared to getting tx height but I've added them anyway
}
func newMCMethodDurationCounter(method string) *prometheus.HistogramVec {
return metrics.NewHistogramVec(prometheus.HistogramOpts{
Namespace: namespace,
Subsystem: mcSubsystem,
Name: fmt.Sprintf("%s_req_duration_seconds", method),
Help: fmt.Sprintf("Accumulated %s request process duration", method),
}, []string{mcSuccess})
}

View file

@ -1,7 +1,6 @@
package metrics package metrics
import ( import (
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/services/tree"
"git.frostfs.info/TrueCloudLab/frostfs-observability/metrics" "git.frostfs.info/TrueCloudLab/frostfs-observability/metrics"
"github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus"
) )
@ -67,7 +66,7 @@ func (m *NodeMetrics) WriteCache() WriteCacheMetrics {
return m.writeCacheMetrics return m.writeCacheMetrics
} }
func (m *NodeMetrics) TreeService() tree.MetricsRegister { func (m *NodeMetrics) TreeService() TreeMetricsRegister {
return m.treeService return m.treeService
} }

View file

@ -10,12 +10,20 @@ import (
const treeServiceLabelSuccess = "success" const treeServiceLabelSuccess = "success"

tree.MetricsRegister has been moved from pkg/services/tree/metrics.go to here and renamed because this caused import cycle

`tree.MetricsRegister` has been moved from `pkg/services/tree/metrics.go` to here and renamed because this caused import cycle
type TreeMetricsRegister interface {
AddReplicateTaskDuration(time.Duration, bool)
AddReplicateWaitDuration(time.Duration, bool)
AddSyncDuration(time.Duration, bool)
}
type treeServiceMetrics struct { type treeServiceMetrics struct {
replicateTaskDuration *prometheus.HistogramVec replicateTaskDuration *prometheus.HistogramVec
replicateWaitDuration *prometheus.HistogramVec replicateWaitDuration *prometheus.HistogramVec
syncOpDuration *prometheus.HistogramVec syncOpDuration *prometheus.HistogramVec
} }
var _ TreeMetricsRegister = (*treeServiceMetrics)(nil)
func newTreeServiceMetrics() *treeServiceMetrics { func newTreeServiceMetrics() *treeServiceMetrics {
const treeServiceSubsystem = "treeservice" const treeServiceSubsystem = "treeservice"
return &treeServiceMetrics{ return &treeServiceMetrics{

View file

@ -10,6 +10,7 @@ import (
"time" "time"
"git.frostfs.info/TrueCloudLab/frostfs-node/internal/logs" "git.frostfs.info/TrueCloudLab/frostfs-node/internal/logs"
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/metrics"
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/util/logger" "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/util/logger"
lru "github.com/hashicorp/golang-lru/v2" lru "github.com/hashicorp/golang-lru/v2"
"github.com/nspcc-dev/neo-go/pkg/core/native/noderoles" "github.com/nspcc-dev/neo-go/pkg/core/native/noderoles"
@ -88,6 +89,8 @@ type cache struct {
nnsHash *util.Uint160 nnsHash *util.Uint160
gKey *keys.PublicKey gKey *keys.PublicKey
txHeights *lru.Cache[util.Uint256, uint32] txHeights *lru.Cache[util.Uint256, uint32]
metrics metrics.MorphCacheMetrics
fyrchik marked this conversation as resolved Outdated

It resides in morph/Client.cache, may be call the field just "metrics"?

It resides in `morph/Client.cache`, may be call the field just "metrics"?
} }
func (c *cache) nns() *util.Uint160 { func (c *cache) nns() *util.Uint160 {

View file

@ -7,6 +7,7 @@ import (
"time" "time"
"git.frostfs.info/TrueCloudLab/frostfs-node/internal/logs" "git.frostfs.info/TrueCloudLab/frostfs-node/internal/logs"
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/metrics"
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/util/logger" "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/util/logger"
lru "github.com/hashicorp/golang-lru/v2" lru "github.com/hashicorp/golang-lru/v2"
"github.com/nspcc-dev/neo-go/pkg/core/transaction" "github.com/nspcc-dev/neo-go/pkg/core/transaction"
@ -42,6 +43,8 @@ type cfg struct {
inactiveModeCb Callback inactiveModeCb Callback
switchInterval time.Duration switchInterval time.Duration
morphCacheMetrics metrics.MorphCacheMetrics
} }
const ( const (
@ -104,7 +107,7 @@ func New(ctx context.Context, key *keys.PrivateKey, opts ...Option) (*Client, er
} }
cli := &Client{ cli := &Client{
cache: newClientCache(), cache: newClientCache(cfg.morphCacheMetrics),
logger: cfg.logger, logger: cfg.logger,
acc: acc, acc: acc,
accAddr: accAddr, accAddr: accAddr,
@ -195,10 +198,11 @@ func newActor(ws *rpcclient.WSClient, acc *wallet.Account, cfg cfg) (*actor.Acto
}}) }})
} }
func newClientCache() cache { func newClientCache(morphCacheMetrics metrics.MorphCacheMetrics) cache {
c, _ := lru.New[util.Uint256, uint32](100) // returns error only if size is negative c, _ := lru.New[util.Uint256, uint32](100) // returns error only if size is negative
return cache{ return cache{
txHeights: c, txHeights: c,
metrics: morphCacheMetrics,
} }
} }
@ -282,3 +286,9 @@ func WithSwitchInterval(i time.Duration) Option {
c.switchInterval = i c.switchInterval = i
} }
} }
func WithMorphCacheMetrics(morphCacheMetrics metrics.MorphCacheMetrics) Option {
return func(c *cfg) {
c.morphCacheMetrics = morphCacheMetrics
}
}

View file

@ -5,6 +5,7 @@ import (
"fmt" "fmt"
"math/big" "math/big"
"strconv" "strconv"
"time"
"git.frostfs.info/TrueCloudLab/frostfs-contract/nns" "git.frostfs.info/TrueCloudLab/frostfs-contract/nns"
"github.com/nspcc-dev/neo-go/pkg/core/transaction" "github.com/nspcc-dev/neo-go/pkg/core/transaction"
@ -81,6 +82,13 @@ func (c *Client) NNSHash() (util.Uint160, error) {
return util.Uint160{}, ErrConnectionLost return util.Uint160{}, ErrConnectionLost
} }
success := false
startedAt := time.Now()
defer func() {
c.cache.metrics.AddNNSContractHashDuration(success, time.Since(startedAt))
}()
fyrchik marked this conversation as resolved Outdated

The comment says // Duration of processing get nns contract hash request if cache miss happens, but this line is executed unconditionally. Whom to believe?

The comment says `// Duration of processing get nns contract hash request if cache miss happens`, but this line is executed unconditionally. Whom to believe?
nnsHash := c.cache.nns() nnsHash := c.cache.nns()
if nnsHash == nil { if nnsHash == nil {
@ -92,6 +100,7 @@ func (c *Client) NNSHash() (util.Uint160, error) {
c.cache.setNNSHash(cs.Hash) c.cache.setNNSHash(cs.Hash)
nnsHash = &cs.Hash nnsHash = &cs.Hash
} }
success = true
return *nnsHash, nil return *nnsHash, nil
} }
@ -221,7 +230,14 @@ func (c *Client) SetGroupSignerScope() error {
// contractGroupKey returns public key designating FrostFS contract group. // contractGroupKey returns public key designating FrostFS contract group.
func (c *Client) contractGroupKey() (*keys.PublicKey, error) { func (c *Client) contractGroupKey() (*keys.PublicKey, error) {
success := false
startedAt := time.Now()
defer func() {
c.cache.metrics.AddGroupKeyDuration(success, time.Since(startedAt))

We use this found as a success indicator, what about success?

We use this `found` as a success indicator, what about `success`?

Alright. I give up :) It seems the majority wants to see success flag - I'll fix it.

For me success label doesn't fit well with metric name.

Let's consider get_tx_height/tx_height metric. How does it look in the histogram with label success:

  • Was it success = true because cache has hit OR because cache has missed and we processed this situation and successfully found the requested?
  • Was it success = false because cache has missed OR because cache has missed and we processed this situation and unsuccessfully found the requested?

These two points were in my mind...

P.S. Okay. Agreed. The same can bee applied for found label

Alright. I give up :) It seems the majority wants to see `success` flag - I'll fix it. For me `success` label doesn't fit well with metric name. Let's consider `get_tx_height`/`tx_height` metric. How does it look in the histogram with label `success`: - Was it `success = true` because cache has hit **OR** because cache has missed and we processed this situation and *successfully* `found` the requested? - Was it `success = false` because cache has missed **OR** because cache has missed and we processed this situation and *unsuccessfully* `found` the requested? These two points were in my mind... P.S. Okay. Agreed. The same can bee applied for `found` label

Let's look at the other side. This is a metric for the cache. I think it's not particularly important here whether we found the object in the end or not. So found is morph client metric, not morph cache.
@fyrchik @aarifullin what do you think?

Let's look at the other side. This is a metric for the cache. I think it's not particularly important here whether we found the object in the end or not. So `found` is `morph client` metric, not `morph cache`. @fyrchik @aarifullin what do you think?

From my POV - it's better to keep morph cache because we count hits/misses and the duration of the miss processing

From my POV - it's better to keep `morph cache` because we count hits/misses and the duration of the miss processing
}()
if gKey := c.cache.groupKey(); gKey != nil { if gKey := c.cache.groupKey(); gKey != nil {
success = true
return gKey, nil return gKey, nil
} }
@ -251,5 +267,7 @@ func (c *Client) contractGroupKey() (*keys.PublicKey, error) {
} }
c.cache.setGroupKey(pub) c.cache.setGroupKey(pub)
success = true
return pub, nil return pub, nil
} }

View file

@ -8,6 +8,7 @@ import (
"math" "math"
"math/big" "math/big"
"strings" "strings"
"time"
"git.frostfs.info/TrueCloudLab/frostfs-node/internal/logs" "git.frostfs.info/TrueCloudLab/frostfs-node/internal/logs"
"git.frostfs.info/TrueCloudLab/frostfs-node/pkg/util/rand" "git.frostfs.info/TrueCloudLab/frostfs-node/pkg/util/rand"
@ -789,7 +790,14 @@ func (c *Client) calculateNonceAndVUB(hash util.Uint256, roundBlockHeight bool)
} }
func (c *Client) getTransactionHeight(h util.Uint256) (uint32, error) { func (c *Client) getTransactionHeight(h util.Uint256) (uint32, error) {
success := false
startedAt := time.Now()
defer func() {
c.cache.metrics.AddTxHeightDuration(h, success, time.Since(startedAt))
}()
if rh, ok := c.cache.txHeights.Get(h); ok { if rh, ok := c.cache.txHeights.Get(h); ok {
success = true
return rh, nil return rh, nil
} }
height, err := c.client.GetTransactionHeight(h) height, err := c.client.GetTransactionHeight(h)
@ -797,5 +805,6 @@ func (c *Client) getTransactionHeight(h util.Uint256) (uint32, error) {
return 0, err return 0, err
} }
c.cache.txHeights.Add(h, height) c.cache.txHeights.Add(h, height)
success = true
return height, nil return height, nil
} }

View file

@ -2,12 +2,6 @@ package tree
import "time" import "time"
type MetricsRegister interface {
AddReplicateTaskDuration(time.Duration, bool)
AddReplicateWaitDuration(time.Duration, bool)
AddSyncDuration(time.Duration, bool)
}
type defaultMetricsRegister struct{} type defaultMetricsRegister struct{}
func (defaultMetricsRegister) AddReplicateTaskDuration(time.Duration, bool) {} func (defaultMetricsRegister) AddReplicateTaskDuration(time.Duration, bool) {}