[#29] metrics: Support dump descriptions

Signed-off-by: Denis Kirillov <d.kirillov@yadro.com>
This commit is contained in:
Denis Kirillov 2023-04-07 18:14:31 +03:00
parent 959213520e
commit cc37c34396
5 changed files with 211 additions and 69 deletions

View file

@ -9,6 +9,8 @@ BUILD ?= $(shell date -u --iso=seconds)
HUB_IMAGE ?= truecloudlab/frostfs-http-gw
HUB_TAG ?= "$(shell echo ${VERSION} | sed 's/^v//')"
METRICS_DUMP_OUT ?= ./metrics-dump.json
# List of binaries to build. For now just one.
BINDIR = bin
DIRS = $(BINDIR)
@ -143,4 +145,10 @@ debpackage:
debclean:
dh clean
# Dump metrics (use METRICS_DUMP_OUT variable to override default out file './metrics-dump.json')
.PHONY: dump-metrics
dump-metrics:
@go test ./metrics -run TestDescribeAll --tags=dump_metrics --out=$(abspath $(METRICS_DUMP_OUT))
include help.mk

12
app.go
View file

@ -243,6 +243,17 @@ func (m *gateMetrics) SetHealth(status metrics.HealthStatus) {
m.provider.SetHealth(status)
}
func (m *gateMetrics) SetVersion(ver string) {
m.mu.RLock()
if !m.enabled {
m.mu.RUnlock()
return
}
m.mu.RUnlock()
m.provider.SetVersion(ver)
}
func (m *gateMetrics) Shutdown() {
m.mu.Lock()
if m.enabled {
@ -325,6 +336,7 @@ func getKeyFromWallet(w *wallet.Wallet, addrStr string, password *string) (*ecds
func (a *app) Wait() {
a.log.Info("starting application", zap.String("app_name", "frostfs-http-gw"), zap.String("version", Version))
a.metrics.SetVersion(Version)
a.setHealthStatus()
<-a.webDone // wait for web-server to be stopped

136
metrics/desc.go Normal file
View file

@ -0,0 +1,136 @@
package metrics
import (
"encoding/json"
"github.com/prometheus/client_golang/prometheus"
)
var appMetricsDesc = map[string]map[string]Description{
poolSubsystem: {
overallErrorsMetric: Description{
Namespace: namespace,
Subsystem: poolSubsystem,
Name: overallErrorsMetric,
Help: "Total number of errors in pool",
},
overallNodeErrorsMetric: Description{
Namespace: namespace,
Subsystem: poolSubsystem,
Name: overallNodeErrorsMetric,
Help: "Total number of errors for connection in pool",
VariableLabels: []string{"node"},
},
overallNodeRequestsMetric: Description{
Namespace: namespace,
Subsystem: poolSubsystem,
Name: overallNodeRequestsMetric,
Help: "Total number of requests to specific node in pool",
VariableLabels: []string{"node"},
},
currentErrorMetric: Description{
Namespace: namespace,
Subsystem: poolSubsystem,
Name: currentErrorMetric,
Help: "Number of errors on current connections that will be reset after the threshold",
VariableLabels: []string{"node"},
},
avgRequestDurationMetric: Description{
Namespace: namespace,
Subsystem: poolSubsystem,
Name: avgRequestDurationMetric,
Help: "Average request duration (in milliseconds) for specific method on node in pool",
VariableLabels: []string{"node", "method"},
},
},
stateSubsystem: {
healthMetric: Description{
Namespace: namespace,
Subsystem: stateSubsystem,
Name: healthMetric,
Help: "Current HTTP gateway state",
},
versionInfoMetric: Description{
Namespace: namespace,
Subsystem: stateSubsystem,
Name: versionInfoMetric,
Help: "Version of current FrostFS HTTP Gate instance",
VariableLabels: []string{"version"},
},
},
}
type Description struct {
Namespace string
Subsystem string
Name string
Help string
ConstantLabels []KeyValue
VariableLabels []string
}
type KeyValue struct {
Key string `json:"key"`
Value string `json:"value"`
}
func (d *Description) MarshalJSON() ([]byte, error) {
return json.Marshal(&struct {
FQName string `json:"name"`
Help string `json:"help"`
ConstantLabels []KeyValue `json:"constant_labels"`
VariableLabels []string `json:"variable_labels"`
}{
FQName: d.BuildFQName(),
Help: d.Help,
ConstantLabels: d.ConstantLabels,
VariableLabels: d.VariableLabels,
})
}
func (d *Description) BuildFQName() string {
return prometheus.BuildFQName(d.Namespace, d.Subsystem, d.Name)
}
func (d *Description) ConstLabelsMap() map[string]string {
constsLabels := make(map[string]string, len(d.ConstantLabels))
for _, kv := range d.ConstantLabels {
constsLabels[kv.Key] = kv.Value
}
return constsLabels
}
// DescribeAll returns descriptions for metrics.
func DescribeAll() []Description {
var list []Description
for _, m := range appMetricsDesc {
for _, description := range m {
list = append(list, description)
}
}
return list
}
func newOpts(description Description) prometheus.Opts {
return prometheus.Opts{
Namespace: description.Namespace,
Subsystem: description.Subsystem,
Name: description.Name,
Help: description.Help,
ConstLabels: description.ConstLabelsMap(),
}
}
func newGauge(description Description) prometheus.Gauge {
return prometheus.NewGauge(
prometheus.GaugeOpts(newOpts(description)),
)
}
func newGaugeVec(description Description) *prometheus.GaugeVec {
return prometheus.NewGaugeVec(
prometheus.GaugeOpts(newOpts(description)),
description.VariableLabels,
)
}

26
metrics/desc_test.go Normal file
View file

@ -0,0 +1,26 @@
//go:build dump_metrics
package metrics
import (
"encoding/json"
"flag"
"os"
"testing"
"github.com/stretchr/testify/require"
)
var metricsPath = flag.String("out", "", "File to export http gateway metrics to.")
func TestDescribeAll(t *testing.T) {
flag.Parse()
require.NotEmpty(t, metricsPath, "flag 'out' must be provided to dump metrics description")
data, err := json.Marshal(DescribeAll())
require.NoError(t, err)
err = os.WriteFile(*metricsPath, data, 0644)
require.NoError(t, err)
}

View file

@ -13,7 +13,22 @@ const (
namespace = "frostfs_http_gw"
stateSubsystem = "state"
poolSubsystem = "pool"
)
const (
healthMetric = "health"
versionInfoMetric = "version_info"
)
const (
overallErrorsMetric = "overall_errors"
overallNodeErrorsMetric = "overall_node_errors"
overallNodeRequestsMetric = "overall_node_requests"
currentErrorMetric = "current_errors"
avgRequestDurationMetric = "avg_request_duration"
)
const (
methodGetBalance = "get_balance"
methodPutContainer = "put_container"
methodGetContainer = "get_container"
@ -48,6 +63,7 @@ type GateMetrics struct {
type stateMetrics struct {
healthCheck prometheus.Gauge
versionInfo *prometheus.GaugeVec
}
type poolMetricsCollector struct {
@ -80,93 +96,37 @@ func (g *GateMetrics) Unregister() {
func newStateMetrics() *stateMetrics {
return &stateMetrics{
healthCheck: prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: namespace,
Subsystem: stateSubsystem,
Name: "health",
Help: "Current HTTP gateway state",
}),
healthCheck: newGauge(appMetricsDesc[stateSubsystem][healthMetric]),
versionInfo: newGaugeVec(appMetricsDesc[stateSubsystem][versionInfoMetric]),
}
}
func (m stateMetrics) register() {
prometheus.MustRegister(m.healthCheck)
prometheus.MustRegister(m.versionInfo)
}
func (m stateMetrics) unregister() {
prometheus.Unregister(m.healthCheck)
prometheus.Unregister(m.versionInfo)
}
func (m stateMetrics) SetHealth(s HealthStatus) {
m.healthCheck.Set(float64(s))
}
func (m stateMetrics) SetVersion(ver string) {
m.versionInfo.WithLabelValues(ver).Set(1)
}
func newPoolMetricsCollector(p *pool.Pool) *poolMetricsCollector {
overallErrors := prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: namespace,
Subsystem: poolSubsystem,
Name: "overall_errors",
Help: "Total number of errors in pool",
},
)
overallNodeErrors := prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: namespace,
Subsystem: poolSubsystem,
Name: "overall_node_errors",
Help: "Total number of errors for connection in pool",
},
[]string{
"node",
},
)
overallNodeRequests := prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: namespace,
Subsystem: poolSubsystem,
Name: "overall_node_requests",
Help: "Total number of requests to specific node in pool",
},
[]string{
"node",
},
)
currentErrors := prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: namespace,
Subsystem: poolSubsystem,
Name: "current_errors",
Help: "Number of errors on current connections that will be reset after the threshold",
},
[]string{
"node",
},
)
requestsDuration := prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: namespace,
Subsystem: poolSubsystem,
Name: "avg_request_duration",
Help: "Average request duration (in milliseconds) for specific method on node in pool",
},
[]string{
"node",
"method",
},
)
return &poolMetricsCollector{
pool: p,
overallErrors: overallErrors,
overallNodeErrors: overallNodeErrors,
overallNodeRequests: overallNodeRequests,
currentErrors: currentErrors,
requestDuration: requestsDuration,
overallErrors: newGauge(appMetricsDesc[poolSubsystem][overallErrorsMetric]),
overallNodeErrors: newGaugeVec(appMetricsDesc[poolSubsystem][overallNodeErrorsMetric]),
overallNodeRequests: newGaugeVec(appMetricsDesc[poolSubsystem][overallNodeRequestsMetric]),
currentErrors: newGaugeVec(appMetricsDesc[poolSubsystem][currentErrorMetric]),
requestDuration: newGaugeVec(appMetricsDesc[poolSubsystem][avgRequestDurationMetric]),
}
}