From ee49355bb7274a463c2f5b0e76fa44220fe061ce Mon Sep 17 00:00:00 2001
From: Denis Kirillov
Date: Thu, 28 Jul 2022 16:44:58 +0300
Subject: [PATCH] [#179] Expose pool metrics

Signed-off-by: Denis Kirillov
---
 app.go             |   2 +-
 metrics/metrics.go | 181 ++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 180 insertions(+), 3 deletions(-)

diff --git a/app.go b/app.go
index bd9dd1d..56cf01a 100644
--- a/app.go
+++ b/app.go
@@ -165,7 +165,7 @@ func newApp(ctx context.Context, opt ...Option) App {
 	}
 
 	if a.cfg.GetBool(cfgPrometheusEnabled) {
-		a.metrics = metrics.NewGateMetrics()
+		a.metrics = metrics.NewGateMetrics(a.pool)
 	}
 
 	return a
diff --git a/metrics/metrics.go b/metrics/metrics.go
index 480cd81..9c79189 100644
--- a/metrics/metrics.go
+++ b/metrics/metrics.go
@@ -3,6 +3,7 @@ package metrics
 import (
 	"net/http"
 
+	"github.com/nspcc-dev/neofs-sdk-go/pool"
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/prometheus/client_golang/prometheus/promhttp"
 	"go.uber.org/zap"
@@ -11,23 +12,59 @@ import (
 const (
 	namespace      = "neofs_http_gw"
 	stateSubsystem = "state"
+	poolSubsystem  = "pool"
+
+	// Values of the "method" label of the avg_request_duration metric.
+	methodGetBalance       = "get_balance"
+	methodPutContainer     = "put_container"
+	methodGetContainer     = "get_container"
+	methodListContainer    = "list_container"
+	methodDeleteContainer  = "delete_container"
+	methodGetContainerEacl = "get_container_eacl"
+	methodSetContainerEacl = "set_container_eacl"
+	methodEndpointInfo     = "endpoint_info"
+	methodNetworkInfo      = "network_info"
+	methodPutObject        = "put_object"
+	methodDeleteObject     = "delete_object"
+	methodGetObject        = "get_object"
+	methodHeadObject       = "head_object"
+	methodRangeObject      = "range_object"
+	methodCreateSession    = "create_session"
 )
 
+// GateMetrics aggregates all metrics exposed by the http gate.
 type GateMetrics struct {
 	stateMetrics
+	poolMetricsCollector
 }
 
 type stateMetrics struct {
 	healthCheck prometheus.Gauge
 }
 
+// poolMetricsCollector is a prometheus.Collector that refreshes its metrics
+// from the pool statistic on every scrape. All values are exported as gauges
+// because they are overwritten with the totals the pool reports.
+type poolMetricsCollector struct {
+	pool                *pool.Pool
+	overallErrors       prometheus.Gauge
+	overallNodeErrors   *prometheus.GaugeVec
+	overallNodeRequests *prometheus.GaugeVec
+	currentErrors       *prometheus.GaugeVec
+	requestDuration     *prometheus.GaugeVec
+}
+
 // NewGateMetrics creates new metrics for http gate.
-func NewGateMetrics() *GateMetrics {
+func NewGateMetrics(p *pool.Pool) *GateMetrics {
 	stateMetric := newStateMetrics()
 	stateMetric.register()
 
+	poolMetric := newPoolMetricsCollector(p)
+	poolMetric.register()
+
 	return &GateMetrics{
-		stateMetrics: *stateMetric,
+		stateMetrics:         *stateMetric,
+		poolMetricsCollector: *poolMetric,
 	}
 }
 
@@ -50,6 +87,146 @@ func (m stateMetrics) SetHealth(s int32) {
 	m.healthCheck.Set(float64(s))
 }
 
+// newPoolMetricsCollector creates the pool metrics bound to pool p.
+// The metrics are not registered until register is called.
+func newPoolMetricsCollector(p *pool.Pool) *poolMetricsCollector {
+	overallErrors := prometheus.NewGauge(
+		prometheus.GaugeOpts{
+			Namespace: namespace,
+			Subsystem: poolSubsystem,
+			Name:      "overall_errors",
+			Help:      "Total number of errors in pool",
+		},
+	)
+
+	overallNodeErrors := prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Namespace: namespace,
+			Subsystem: poolSubsystem,
+			Name:      "overall_node_errors",
+			Help:      "Total number of errors for connection in pool",
+		},
+		[]string{
+			"node",
+		},
+	)
+
+	overallNodeRequests := prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Namespace: namespace,
+			Subsystem: poolSubsystem,
+			Name:      "overall_node_requests",
+			Help:      "Total number of requests to specific node in pool",
+		},
+		[]string{
+			"node",
+		},
+	)
+
+	currentErrors := prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Namespace: namespace,
+			Subsystem: poolSubsystem,
+			Name:      "current_errors",
+			Help:      "Number of errors on current connections that will be reset after the threshold",
+		},
+		[]string{
+			"node",
+		},
+	)
+
+	requestsDuration := prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Namespace: namespace,
+			Subsystem: poolSubsystem,
+			Name:      "avg_request_duration",
+			Help:      "Average request duration (in milliseconds) for specific method on node in pool",
+		},
+		[]string{
+			"node",
+			"method",
+		},
+	)
+
+	return &poolMetricsCollector{
+		pool:                p,
+		overallErrors:       overallErrors,
+		overallNodeErrors:   overallNodeErrors,
+		overallNodeRequests: overallNodeRequests,
+		currentErrors:       currentErrors,
+		requestDuration:     requestsDuration,
+	}
+}
+
+// Collect implements prometheus.Collector. It refreshes all values from the
+// pool statistic and then sends them to ch.
+func (m *poolMetricsCollector) Collect(ch chan<- prometheus.Metric) {
+	m.updateStatistic()
+	m.overallErrors.Collect(ch)
+	m.overallNodeErrors.Collect(ch)
+	m.overallNodeRequests.Collect(ch)
+	m.currentErrors.Collect(ch)
+	m.requestDuration.Collect(ch)
+}
+
+// Describe implements prometheus.Collector.
+func (m *poolMetricsCollector) Describe(descs chan<- *prometheus.Desc) {
+	m.overallErrors.Describe(descs)
+	m.overallNodeErrors.Describe(descs)
+	m.overallNodeRequests.Describe(descs)
+	m.currentErrors.Describe(descs)
+	m.requestDuration.Describe(descs)
+}
+
+// register adds the collector to the default prometheus registry.
+// It panics if the collector is already registered.
+func (m *poolMetricsCollector) register() {
+	prometheus.MustRegister(m)
+}
+
+// updateStatistic overwrites every metric with the current pool statistic.
+// The pool reports accumulated totals, so values are stored with Set: adding
+// them on each scrape would count the same errors and requests repeatedly.
+func (m *poolMetricsCollector) updateStatistic() {
+	stat := m.pool.Statistic()
+
+	// Drop label sets of nodes that are no longer reported by the pool.
+	m.overallNodeErrors.Reset()
+	m.overallNodeRequests.Reset()
+	m.currentErrors.Reset()
+	m.requestDuration.Reset()
+
+	for _, node := range stat.Nodes() {
+		m.overallNodeErrors.WithLabelValues(node.Address()).Set(float64(node.OverallErrors()))
+		m.overallNodeRequests.WithLabelValues(node.Address()).Set(float64(node.Requests()))
+
+		m.currentErrors.WithLabelValues(node.Address()).Set(float64(node.CurrentErrors()))
+		m.updateRequestsDuration(node)
+	}
+
+	m.overallErrors.Set(float64(stat.OverallErrors()))
+}
+
+// updateRequestsDuration stores the average duration of every pool method
+// for the given node, in whole milliseconds.
+func (m *poolMetricsCollector) updateRequestsDuration(node pool.NodeStatistic) {
+	m.requestDuration.WithLabelValues(node.Address(), methodGetBalance).Set(float64(node.AverageGetBalance().Milliseconds()))
+	m.requestDuration.WithLabelValues(node.Address(), methodPutContainer).Set(float64(node.AveragePutContainer().Milliseconds()))
+	m.requestDuration.WithLabelValues(node.Address(), methodGetContainer).Set(float64(node.AverageGetContainer().Milliseconds()))
+	m.requestDuration.WithLabelValues(node.Address(), methodListContainer).Set(float64(node.AverageListContainer().Milliseconds()))
+	m.requestDuration.WithLabelValues(node.Address(), methodDeleteContainer).Set(float64(node.AverageDeleteContainer().Milliseconds()))
+	m.requestDuration.WithLabelValues(node.Address(), methodGetContainerEacl).Set(float64(node.AverageGetContainerEACL().Milliseconds()))
+	m.requestDuration.WithLabelValues(node.Address(), methodSetContainerEacl).Set(float64(node.AverageSetContainerEACL().Milliseconds()))
+	m.requestDuration.WithLabelValues(node.Address(), methodEndpointInfo).Set(float64(node.AverageEndpointInfo().Milliseconds()))
+	m.requestDuration.WithLabelValues(node.Address(), methodNetworkInfo).Set(float64(node.AverageNetworkInfo().Milliseconds()))
+	m.requestDuration.WithLabelValues(node.Address(), methodPutObject).Set(float64(node.AveragePutObject().Milliseconds()))
+	m.requestDuration.WithLabelValues(node.Address(), methodDeleteObject).Set(float64(node.AverageDeleteObject().Milliseconds()))
+	m.requestDuration.WithLabelValues(node.Address(), methodGetObject).Set(float64(node.AverageGetObject().Milliseconds()))
+	m.requestDuration.WithLabelValues(node.Address(), methodHeadObject).Set(float64(node.AverageHeadObject().Milliseconds()))
+	m.requestDuration.WithLabelValues(node.Address(), methodRangeObject).Set(float64(node.AverageRangeObject().Milliseconds()))
+	m.requestDuration.WithLabelValues(node.Address(), methodCreateSession).Set(float64(node.AverageCreateSession().Milliseconds()))
+}
+
 // NewPrometheusService creates a new service for gathering prometheus metrics.
 func NewPrometheusService(log *zap.Logger, cfg Config) *Service {
 	if log == nil {