Cleanup metrics (#3776)

Cleanup a variety of metric issues. * Eliminate department of redundancy "count_total" naming. * Use the plural of the unit when appropriate. (ex, "requests") * Remove label names from metric names where appropriate. (ex, "rcode") * Simplify request metrics by consolidating type label into the base request counter. * Re-generate man pages. Signed-off-by: Ben Kochie <superq@gmail.com> Co-authored-by: Ben Kochie <superq@gmail.com>
2020-03-26 09:17:33 +01:00 · 2020-03-26 09:17:33 +01:00 · 19cfa2960c
commit 19cfa2960c
parent eb23cce1a7
16 changed files with 39 additions and 50 deletions
--- a/plugin/acl/README.md
+++ b/plugin/acl/README.md
@ -73,8 +73,8 @@ example.org {

 If monitoring is enabled (via the _prometheus_ plugin) then the following metrics are exported:

- `coredns_request_block_count_total{server, zone}` - counter of DNS requests being blocked.
+- `coredns_dns_acl_blocked_requests_total{server, zone}` - counter of DNS requests being blocked.

- `coredns_request_allow_count_total{server}` - counter of DNS requests being allowed.
+- `coredns_dns_acl_allowed_requests_total{server}` - counter of DNS requests being allowed.

 The `server` and `zone` labels are explained in the _metrics_ plugin documentation.
--- a/plugin/acl/metrics.go
+++ b/plugin/acl/metrics.go
@ -11,14 +11,14 @@ var (
 	RequestBlockCount = prometheus.NewCounterVec(prometheus.CounterOpts{
 		Namespace: plugin.Namespace,
 		Subsystem: "dns",
-		Name:      "request_block_count_total",
+		Name:      "acl_blocked_requests_total",
 		Help:      "Counter of DNS requests being blocked.",
 	}, []string{"server", "zone"})
 	// RequestAllowCount is the number of DNS requests being Allowed.
 	RequestAllowCount = prometheus.NewCounterVec(prometheus.CounterOpts{
 		Namespace: plugin.Namespace,
 		Subsystem: "dns",
-		Name:      "request_allow_count_total",
+		Name:      "acl_allowed_requests_total",
 		Help:      "Counter of DNS requests being allowed.",
 	}, []string{"server"})
 )
--- a/plugin/autopath/README.md
+++ b/plugin/autopath/README.md
@ -31,7 +31,7 @@ If a plugin implements the `AutoPather` interface then it can be used.

 If monitoring is enabled (via the *prometheus* plugin) then the following metric is exported:

-* `coredns_autopath_success_count_total{server}` - counter of successfully autopath-ed queries.
+* `coredns_autopath_success_total{server}` - counter of successfully autopath-ed queries.

 The `server` label is explained in the *metrics* plugin documentation.

--- a/plugin/autopath/metrics.go
+++ b/plugin/autopath/metrics.go
@ -10,7 +10,7 @@ var (
 	autoPathCount = prometheus.NewCounterVec(prometheus.CounterOpts{
 		Namespace: plugin.Namespace,
 		Subsystem: "autopath",
-		Name:      "success_count_total",
+		Name:      "success_total",
 		Help:      "Counter of requests that did autopath.",
 	}, []string{"server"})
 )
--- a/plugin/forward/README.md
+++ b/plugin/forward/README.md
@ -108,12 +108,12 @@ On each endpoint, the timeouts of the communication are set by default and autom
 If monitoring is enabled (via the *prometheus* plugin) then the following metric are exported:

 * `coredns_forward_request_duration_seconds{to}` - duration per upstream interaction.
-* `coredns_forward_request_count_total{to}` - query count per upstream.
-* `coredns_forward_response_rcode_count_total{to, rcode}` - count of RCODEs per upstream.
-* `coredns_forward_healthcheck_failure_count_total{to}` - number of failed health checks per upstream.
-* `coredns_forward_healthcheck_broken_count_total{}` - counter of when all upstreams are unhealthy,
+* `coredns_forward_requests_total{to}` - query count per upstream.
+* `coredns_forward_responses_total{to, rcode}` - count of RCODEs per upstream.
+* `coredns_forward_healthcheck_failures_total{to}` - number of failed health checks per upstream.
+* `coredns_forward_healthcheck_broken_total{}` - counter of when all upstreams are unhealthy,
  and we are randomly (this always uses the `random` policy) spraying to an upstream.
-* `max_concurrent_reject_count_total{}` - counter of the number of queries rejected because the
+* `coredns_forward_max_concurrent_rejects_total{}` - counter of the number of queries rejected because the
  number of concurrent queries were at maximum.
 Where `to` is one of the upstream servers (**TO** from the config), `rcode` is the returned RCODE
 from the upstream.
--- a/plugin/forward/metrics.go
+++ b/plugin/forward/metrics.go
@ -11,13 +11,13 @@ var (
 	RequestCount = prometheus.NewCounterVec(prometheus.CounterOpts{
 		Namespace: plugin.Namespace,
 		Subsystem: "forward",
-		Name:      "request_count_total",
+		Name:      "requests_total",
 		Help:      "Counter of requests made per upstream.",
 	}, []string{"to"})
 	RcodeCount = prometheus.NewCounterVec(prometheus.CounterOpts{
 		Namespace: plugin.Namespace,
 		Subsystem: "forward",
-		Name:      "response_rcode_count_total",
+		Name:      "responses_total",
 		Help:      "Counter of requests made per upstream.",
 	}, []string{"rcode", "to"})
 	RequestDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
@ -30,13 +30,13 @@ var (
 	HealthcheckFailureCount = prometheus.NewCounterVec(prometheus.CounterOpts{
 		Namespace: plugin.Namespace,
 		Subsystem: "forward",
-		Name:      "healthcheck_failure_count_total",
+		Name:      "healthcheck_failures_total",
 		Help:      "Counter of the number of failed healthchecks.",
 	}, []string{"to"})
 	HealthcheckBrokenCount = prometheus.NewCounter(prometheus.CounterOpts{
 		Namespace: plugin.Namespace,
 		Subsystem: "forward",
-		Name:      "healthcheck_broken_count_total",
+		Name:      "healthcheck_broken_total",
 		Help:      "Counter of the number of complete failures of the healthchecks.",
 	})
 	SocketGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
@ -48,7 +48,7 @@ var (
 	MaxConcurrentRejectCount = prometheus.NewCounter(prometheus.CounterOpts{
 		Namespace: plugin.Namespace,
 		Subsystem: "forward",
-		Name:      "max_concurrent_reject_count_total",
+		Name:      "max_concurrent_rejects_total",
 		Help:      "Counter of the number of queries rejected because the concurrent queries were at maximum.",
 	})
 )
--- a/plugin/grpc/README.md
+++ b/plugin/grpc/README.md
@ -63,8 +63,8 @@ Also note the TLS config is "global" for the whole grpc proxy if you need a diff
 If monitoring is enabled (via the *prometheus* plugin) then the following metric are exported:

 * `coredns_grpc_request_duration_seconds{to}` - duration per upstream interaction.
-* `coredns_grpc_request_count_total{to}` - query count per upstream.
-* `coredns_grpc_response_rcode_count_total{to, rcode}` - count of RCODEs per upstream.
+* `coredns_grpc_requests_total{to}` - query count per upstream.
+* `coredns_grpc_responses_total{to, rcode}` - count of RCODEs per upstream.
  and we are randomly (this always uses the `random` policy) spraying to an upstream.

 ## Examples
--- a/plugin/grpc/metrics.go
+++ b/plugin/grpc/metrics.go
@ -11,13 +11,13 @@ var (
 	RequestCount = prometheus.NewCounterVec(prometheus.CounterOpts{
 		Namespace: plugin.Namespace,
 		Subsystem: "grpc",
-		Name:      "request_count_total",
+		Name:      "requests_total",
 		Help:      "Counter of requests made per upstream.",
 	}, []string{"to"})
 	RcodeCount = prometheus.NewCounterVec(prometheus.CounterOpts{
 		Namespace: plugin.Namespace,
 		Subsystem: "grpc",
-		Name:      "response_rcode_count_total",
+		Name:      "responses_total",
 		Help:      "Counter of requests made per upstream.",
 	}, []string{"rcode", "to"})
 	RequestDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
--- a/plugin/metrics/README.md
+++ b/plugin/metrics/README.md
@ -11,14 +11,13 @@ The default location for the metrics is `localhost:9153`. The metrics path is fi
 The following metrics are exported:

 * `coredns_build_info{version, revision, goversion}` - info about CoreDNS itself.
-* `coredns_panic_count_total{}` - total number of panics.
-* `coredns_dns_request_count_total{server, zone, proto, family}` - total query count.
+* `coredns_panics_total{}` - total number of panics.
+* `coredns_dns_requests_total{server, zone, proto, family, type}` - total query count.
 * `coredns_dns_request_duration_seconds{server, zone, type}` - duration to process each query.
 * `coredns_dns_request_size_bytes{server, zone, proto}` - size of the request in bytes.
-* `coredns_dns_request_do_count_total{server, zone}` -  queries that have the DO bit set
-* `coredns_dns_request_type_count_total{server, zone, type}` - counter of queries per zone and type.
+* `coredns_dns_do_requests_total{server, zone}` - queries that have the DO bit set.
 * `coredns_dns_response_size_bytes{server, zone, proto}` - response size in bytes.
-* `coredns_dns_response_rcode_count_total{server, zone, rcode}` - response per zone and rcode.
+* `coredns_dns_responses_total{server, zone, rcode}` - response per zone and rcode.
 * `coredns_plugin_enabled{server, zone, name}` - indicates whether a plugin is enabled on per server and zone basis.

 Each counter has a label `zone` which is the zonename used for the request/response.
@ -33,7 +32,6 @@ Extra labels used are:
 * `type` which holds the query type. It holds most common types (A, AAAA, MX, SOA, CNAME, PTR, TXT,
  NS, SRV, DS, DNSKEY, RRSIG, NSEC, NSEC3, IXFR, AXFR and ANY) and "other" which lumps together all
  other types.
-* The `response_rcode_count_total` has an extra label `rcode` which holds the rcode of the response.

 If monitoring is enabled, queries that do not enter the plugin chain are exported under the fake
 name "dropped" (without a closing dot - this is never a valid domain name).
--- a/plugin/metrics/metrics.go
+++ b/plugin/metrics/metrics.go
@ -51,7 +51,6 @@ func New(addr string) *Metrics {
 	met.MustRegister(vars.RequestDuration)
 	met.MustRegister(vars.RequestSize)
 	met.MustRegister(vars.RequestDo)
-	met.MustRegister(vars.RequestType)
 	met.MustRegister(vars.ResponseSize)
 	met.MustRegister(vars.ResponseRcode)
 	met.MustRegister(vars.PluginEnabled)
--- a/plugin/metrics/metrics_test.go
+++ b/plugin/metrics/metrics_test.go
@ -31,25 +31,25 @@ func TestMetrics(t *testing.T) {
 		{
 			next:          test.NextHandler(dns.RcodeSuccess, nil),
 			qname:         "example.org",
-			metric:        "coredns_dns_request_count_total",
+			metric:        "coredns_dns_requests_total",
 			expectedValue: "1",
 		},
 		{
 			next:          test.NextHandler(dns.RcodeSuccess, nil),
 			qname:         "example.org",
-			metric:        "coredns_dns_request_count_total",
+			metric:        "coredns_dns_requests_total",
 			expectedValue: "2",
 		},
 		{
 			next:          test.NextHandler(dns.RcodeSuccess, nil),
 			qname:         "example.org",
-			metric:        "coredns_dns_request_type_count_total",
+			metric:        "coredns_dns_requests_total",
 			expectedValue: "3",
 		},
 		{
 			next:          test.NextHandler(dns.RcodeSuccess, nil),
 			qname:         "example.org",
-			metric:        "coredns_dns_response_rcode_count_total",
+			metric:        "coredns_dns_responses_total",
 			expectedValue: "4",
 		},
 	}
--- a/plugin/metrics/vars/report.go
+++ b/plugin/metrics/vars/report.go
@ -20,17 +20,16 @@ func Report(server string, req request.Request, zone, rcode string, size int, st
 	}

 	typ := req.QType()
-	RequestCount.WithLabelValues(server, zone, net, fam).Inc()

 	if req.Do() {
 		RequestDo.WithLabelValues(server, zone).Inc()
 	}

 	if _, known := monitorType[typ]; known {
-		RequestType.WithLabelValues(server, zone, dns.Type(typ).String()).Inc()
+		RequestCount.WithLabelValues(server, zone, net, fam, dns.Type(typ).String()).Inc()
 		RequestDuration.WithLabelValues(server, zone, dns.Type(typ).String()).Observe(time.Since(start).Seconds())
 	} else {
-		RequestType.WithLabelValues(server, zone, other).Inc()
+		RequestCount.WithLabelValues(server, zone, net, fam, other).Inc()
 		RequestDuration.WithLabelValues(server, zone, other).Observe(time.Since(start).Seconds())
 	}

--- a/plugin/metrics/vars/vars.go
+++ b/plugin/metrics/vars/vars.go
@ -11,9 +11,9 @@ var (
 	RequestCount = prometheus.NewCounterVec(prometheus.CounterOpts{
 		Namespace: plugin.Namespace,
 		Subsystem: subsystem,
-		Name:      "request_count_total",
+		Name:      "requests_total",
 		Help:      "Counter of DNS requests made per zone, protocol and family.",
-	}, []string{"server", "zone", "proto", "family"})
+	}, []string{"server", "zone", "proto", "family", "type"})

 	RequestDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
 		Namespace: plugin.Namespace,
@ -34,17 +34,10 @@ var (
 	RequestDo = prometheus.NewCounterVec(prometheus.CounterOpts{
 		Namespace: plugin.Namespace,
 		Subsystem: subsystem,
-		Name:      "request_do_count_total",
+		Name:      "do_requests_total",
 		Help:      "Counter of DNS requests with DO bit set per zone.",
 	}, []string{"server", "zone"})

-	RequestType = prometheus.NewCounterVec(prometheus.CounterOpts{
-		Namespace: plugin.Namespace,
-		Subsystem: subsystem,
-		Name:      "request_type_count_total",
-		Help:      "Counter of DNS requests per type, per zone.",
-	}, []string{"server", "zone", "type"})
-
 	ResponseSize = prometheus.NewHistogramVec(prometheus.HistogramOpts{
 		Namespace: plugin.Namespace,
 		Subsystem: subsystem,
@ -56,13 +49,13 @@ var (
 	ResponseRcode = prometheus.NewCounterVec(prometheus.CounterOpts{
 		Namespace: plugin.Namespace,
 		Subsystem: subsystem,
-		Name:      "response_rcode_count_total",
+		Name:      "responses_total",
 		Help:      "Counter of response status codes.",
 	}, []string{"server", "zone", "rcode"})

 	Panic = prometheus.NewCounter(prometheus.CounterOpts{
 		Namespace: plugin.Namespace,
-		Name:      "panic_count_total",
+		Name:      "panics_total",
 		Help:      "A metrics that counts the number of panics.",
 	})

--- a/plugin/reload/README.md
+++ b/plugin/reload/README.md
@ -98,7 +98,7 @@ CoreDNS v1.7.0 and later does parse the Corefile and supports detecting changes

 If monitoring is enabled (via the *prometheus* plugin) then the following metric is exported:

-* `coredns_reload_failed_count_total{}` - counts the number of failed reload attempts.
+* `coredns_reload_failed_total{}` - counts the number of failed reload attempts.
 * `coredns_reload_version_info{hash, value}` - record the hash value during reload.

 Currently the type of `hash` is "md5", the `value` is the returned hash value.
--- a/plugin/reload/metrics.go
+++ b/plugin/reload/metrics.go
@ -11,7 +11,7 @@ var (
 	FailedCount = prometheus.NewCounter(prometheus.CounterOpts{
 		Namespace: plugin.Namespace,
 		Subsystem: "reload",
-		Name:      "failed_count_total",
+		Name:      "failed_total",
 		Help:      "Counter of the number of failed reload attempts.",
 	})

--- a/test/metrics_test.go
+++ b/test/metrics_test.go
@ -36,7 +36,7 @@ example.com:0 {
 }

 func TestMetricsRefused(t *testing.T) {
-	metricName := "coredns_dns_response_rcode_count_total"
+	metricName := "coredns_dns_responses_total"

 	corefile := `example.org:0 {
 	forward . 8.8.8.8:53
@ -110,7 +110,7 @@ func TestMetricsAuto(t *testing.T) {
 		t.Fatalf("Could not send message: %s", err)
 	}

-	metricName := "coredns_dns_request_count_total" //{zone, proto, family}
+	metricName := "coredns_dns_requests_total" // {zone, proto, family, type}

 	data := test.Scrape("http://" + metrics.ListenAddr + "/metrics")
 	// Get the value for the metrics where the one of the labels values matches "example.org."