From 19cfa2960cc415e0e38d36fc9d9325bd636c000e Mon Sep 17 00:00:00 2001
From: Miek Gieben <miek@miek.nl>
Date: Thu, 26 Mar 2020 09:17:33 +0100
Subject: [PATCH] Cleanup metrics (#3776)

Cleanup a variety of metric issues.
* Eliminate department of redundancy "count_total" naming.
* Use the plural of the unit when appropriate. (ex, "requests")
* Remove label names from metric names where appropriate. (ex, "rcode")
* Simplify request metrics by consolidating type label in to the base
request counter.
* Re-generate man pages.

Signed-off-by: Ben Kochie <superq@gmail.com>

Co-authored-by: Ben Kochie <superq@gmail.com>
---
 plugin/acl/README.md           |  4 ++--
 plugin/acl/metrics.go          |  4 ++--
 plugin/autopath/README.md      |  2 +-
 plugin/autopath/metrics.go     |  2 +-
 plugin/forward/README.md       | 10 +++++-----
 plugin/forward/metrics.go      | 10 +++++-----
 plugin/grpc/README.md          |  4 ++--
 plugin/grpc/metrics.go         |  4 ++--
 plugin/metrics/README.md       | 10 ++++------
 plugin/metrics/metrics.go      |  1 -
 plugin/metrics/metrics_test.go |  8 ++++----
 plugin/metrics/vars/report.go  |  5 ++---
 plugin/metrics/vars/vars.go    | 17 +++++------------
 plugin/reload/README.md        |  2 +-
 plugin/reload/metrics.go       |  2 +-
 test/metrics_test.go           |  4 ++--
 16 files changed, 39 insertions(+), 50 deletions(-)

diff --git a/plugin/acl/README.md b/plugin/acl/README.md
index f96fb12f2..fd08b406b 100644
--- a/plugin/acl/README.md
+++ b/plugin/acl/README.md
@@ -73,8 +73,8 @@ example.org {
 
 If monitoring is enabled (via the _prometheus_ plugin) then the following metrics are exported:
 
-- `coredns_request_block_count_total{server, zone}` - counter of DNS requests being blocked.
+- `coredns_dns_blocked_requests_total{server, zone}` - counter of DNS requests being blocked.
 
-- `coredns_request_allow_count_total{server}` - counter of DNS requests being allowed.
+- `coredns_dns_allowed_requests_total{server}` - counter of DNS requests being allowed.
 
 The `server` and `zone` labels are explained in the _metrics_ plugin documentation.
diff --git a/plugin/acl/metrics.go b/plugin/acl/metrics.go
index 442ea2374..1f235b85b 100644
--- a/plugin/acl/metrics.go
+++ b/plugin/acl/metrics.go
@@ -11,14 +11,14 @@ var (
 	RequestBlockCount = prometheus.NewCounterVec(prometheus.CounterOpts{
 		Namespace: plugin.Namespace,
 		Subsystem: "dns",
-		Name:      "request_block_count_total",
+		Name:      "acl_blocked_requests_total",
 		Help:      "Counter of DNS requests being blocked.",
 	}, []string{"server", "zone"})
 	// RequestAllowCount is the number of DNS requests being Allowed.
 	RequestAllowCount = prometheus.NewCounterVec(prometheus.CounterOpts{
 		Namespace: plugin.Namespace,
 		Subsystem: "dns",
-		Name:      "request_allow_count_total",
+		Name:      "acl_allowed_requests_total",
 		Help:      "Counter of DNS requests being allowed.",
 	}, []string{"server"})
 )
diff --git a/plugin/autopath/README.md b/plugin/autopath/README.md
index 52156ba5c..b1b4b165c 100644
--- a/plugin/autopath/README.md
+++ b/plugin/autopath/README.md
@@ -31,7 +31,7 @@ If a plugin implements the `AutoPather` interface then it can be used.
 
 If monitoring is enabled (via the *prometheus* plugin) then the following metric is exported:
 
-* `coredns_autopath_success_count_total{server}` - counter of successfully autopath-ed queries.
+* `coredns_autopath_success_total{server}` - counter of successfully autopath-ed queries.
 
 The `server` label is explained in the *metrics* plugin documentation.
 
diff --git a/plugin/autopath/metrics.go b/plugin/autopath/metrics.go
index ae25ab79e..cfb276274 100644
--- a/plugin/autopath/metrics.go
+++ b/plugin/autopath/metrics.go
@@ -10,7 +10,7 @@ var (
 	autoPathCount = prometheus.NewCounterVec(prometheus.CounterOpts{
 		Namespace: plugin.Namespace,
 		Subsystem: "autopath",
-		Name:      "success_count_total",
+		Name:      "success_total",
 		Help:      "Counter of requests that did autopath.",
 	}, []string{"server"})
 )
diff --git a/plugin/forward/README.md b/plugin/forward/README.md
index 92b6bd168..be9d29ecf 100644
--- a/plugin/forward/README.md
+++ b/plugin/forward/README.md
@@ -108,12 +108,12 @@ On each endpoint, the timeouts of the communication are set by default and autom
 If monitoring is enabled (via the *prometheus* plugin) then the following metric are exported:
 
 * `coredns_forward_request_duration_seconds{to}` - duration per upstream interaction.
-* `coredns_forward_request_count_total{to}` - query count per upstream.
-* `coredns_forward_response_rcode_count_total{to, rcode}` - count of RCODEs per upstream.
-* `coredns_forward_healthcheck_failure_count_total{to}` - number of failed health checks per upstream.
-* `coredns_forward_healthcheck_broken_count_total{}` - counter of when all upstreams are unhealthy,
+* `coredns_forward_requests_total{to}` - query count per upstream.
+* `coredns_forward_responses_total{to, rcode}` - count of RCODEs per upstream.
+* `coredns_forward_healthcheck_failures_total{to}` - number of failed health checks per upstream.
+* `coredns_forward_healthcheck_broken_total{}` - counter of when all upstreams are unhealthy,
   and we are randomly (this always uses the `random` policy) spraying to an upstream.
-* `max_concurrent_reject_count_total{}` - counter of the number of queries rejected because the
+* `max_concurrent_rejects_total{}` - counter of the number of queries rejected because the
   number of concurrent queries were at maximum.
 Where `to` is one of the upstream servers (**TO** from the config), `rcode` is the returned RCODE
 from the upstream.
diff --git a/plugin/forward/metrics.go b/plugin/forward/metrics.go
index d92028d24..4122bc210 100644
--- a/plugin/forward/metrics.go
+++ b/plugin/forward/metrics.go
@@ -11,13 +11,13 @@ var (
 	RequestCount = prometheus.NewCounterVec(prometheus.CounterOpts{
 		Namespace: plugin.Namespace,
 		Subsystem: "forward",
-		Name:      "request_count_total",
+		Name:      "requests_total",
 		Help:      "Counter of requests made per upstream.",
 	}, []string{"to"})
 	RcodeCount = prometheus.NewCounterVec(prometheus.CounterOpts{
 		Namespace: plugin.Namespace,
 		Subsystem: "forward",
-		Name:      "response_rcode_count_total",
+		Name:      "responses_total",
 		Help:      "Counter of requests made per upstream.",
 	}, []string{"rcode", "to"})
 	RequestDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
@@ -30,13 +30,13 @@ var (
 	HealthcheckFailureCount = prometheus.NewCounterVec(prometheus.CounterOpts{
 		Namespace: plugin.Namespace,
 		Subsystem: "forward",
-		Name:      "healthcheck_failure_count_total",
+		Name:      "healthcheck_failures_total",
 		Help:      "Counter of the number of failed healthchecks.",
 	}, []string{"to"})
 	HealthcheckBrokenCount = prometheus.NewCounter(prometheus.CounterOpts{
 		Namespace: plugin.Namespace,
 		Subsystem: "forward",
-		Name:      "healthcheck_broken_count_total",
+		Name:      "healthcheck_broken_total",
 		Help:      "Counter of the number of complete failures of the healthchecks.",
 	})
 	SocketGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
@@ -48,7 +48,7 @@ var (
 	MaxConcurrentRejectCount = prometheus.NewCounter(prometheus.CounterOpts{
 		Namespace: plugin.Namespace,
 		Subsystem: "forward",
-		Name:      "max_concurrent_reject_count_total",
+		Name:      "max_concurrent_rejects_total",
 		Help:      "Counter of the number of queries rejected because the concurrent queries were at maximum.",
 	})
 )
diff --git a/plugin/grpc/README.md b/plugin/grpc/README.md
index 36314232a..5e6148da9 100644
--- a/plugin/grpc/README.md
+++ b/plugin/grpc/README.md
@@ -63,8 +63,8 @@ Also note the TLS config is "global" for the whole grpc proxy if you need a diff
 If monitoring is enabled (via the *prometheus* plugin) then the following metric are exported:
 
 * `coredns_grpc_request_duration_seconds{to}` - duration per upstream interaction.
-* `coredns_grpc_request_count_total{to}` - query count per upstream.
-* `coredns_grpc_response_rcode_count_total{to, rcode}` - count of RCODEs per upstream.
+* `coredns_grpc_requests_total{to}` - query count per upstream.
+* `coredns_grpc_responses_total{to, rcode}` - count of RCODEs per upstream.
   and we are randomly (this always uses the `random` policy) spraying to an upstream.
 
 ## Examples
diff --git a/plugin/grpc/metrics.go b/plugin/grpc/metrics.go
index 76b186bee..e4a3ce2f5 100644
--- a/plugin/grpc/metrics.go
+++ b/plugin/grpc/metrics.go
@@ -11,13 +11,13 @@ var (
 	RequestCount = prometheus.NewCounterVec(prometheus.CounterOpts{
 		Namespace: plugin.Namespace,
 		Subsystem: "grpc",
-		Name:      "request_count_total",
+		Name:      "requests_total",
 		Help:      "Counter of requests made per upstream.",
 	}, []string{"to"})
 	RcodeCount = prometheus.NewCounterVec(prometheus.CounterOpts{
 		Namespace: plugin.Namespace,
 		Subsystem: "grpc",
-		Name:      "response_rcode_count_total",
+		Name:      "responses_total",
 		Help:      "Counter of requests made per upstream.",
 	}, []string{"rcode", "to"})
 	RequestDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
diff --git a/plugin/metrics/README.md b/plugin/metrics/README.md
index 3379919c9..ab773f65b 100644
--- a/plugin/metrics/README.md
+++ b/plugin/metrics/README.md
@@ -11,14 +11,13 @@ The default location for the metrics is `localhost:9153`. The metrics path is fi
 The following metrics are exported:
 
 * `coredns_build_info{version, revision, goversion}` - info about CoreDNS itself.
-* `coredns_panic_count_total{}` - total number of panics.
-* `coredns_dns_request_count_total{server, zone, proto, family}` - total query count.
+* `coredns_panics_total{}` - total number of panics.
+* `coredns_dns_requests_total{server, zone, proto, family, type}` - total query count.
 * `coredns_dns_request_duration_seconds{server, zone, type}` - duration to process each query.
 * `coredns_dns_request_size_bytes{server, zone, proto}` - size of the request in bytes.
-* `coredns_dns_request_do_count_total{server, zone}` -  queries that have the DO bit set
-* `coredns_dns_request_type_count_total{server, zone, type}` - counter of queries per zone and type.
+* `coredns_dns_do_requests_total{server, zone}` -  queries that have the DO bit set
 * `coredns_dns_response_size_bytes{server, zone, proto}` - response size in bytes.
-* `coredns_dns_response_rcode_count_total{server, zone, rcode}` - response per zone and rcode.
+* `coredns_dns_responses_total{server, zone, rcode}` - response per zone and rcode.
 * `coredns_plugin_enabled{server, zone, name}` - indicates whether a plugin is enabled on per server and zone basis.
 
 Each counter has a label `zone` which is the zonename used for the request/response.
@@ -33,7 +32,6 @@ Extra labels used are:
 * `type` which holds the query type. It holds most common types (A, AAAA, MX, SOA, CNAME, PTR, TXT,
   NS, SRV, DS, DNSKEY, RRSIG, NSEC, NSEC3, IXFR, AXFR and ANY) and "other" which lumps together all
   other types.
-* The `response_rcode_count_total` has an extra label `rcode` which holds the rcode of the response.
 
 If monitoring is enabled, queries that do not enter the plugin chain are exported under the fake
 name "dropped" (without a closing dot - this is never a valid domain name).
diff --git a/plugin/metrics/metrics.go b/plugin/metrics/metrics.go
index ddadd7aca..896e1bc2e 100644
--- a/plugin/metrics/metrics.go
+++ b/plugin/metrics/metrics.go
@@ -51,7 +51,6 @@ func New(addr string) *Metrics {
 	met.MustRegister(vars.RequestDuration)
 	met.MustRegister(vars.RequestSize)
 	met.MustRegister(vars.RequestDo)
-	met.MustRegister(vars.RequestType)
 	met.MustRegister(vars.ResponseSize)
 	met.MustRegister(vars.ResponseRcode)
 	met.MustRegister(vars.PluginEnabled)
diff --git a/plugin/metrics/metrics_test.go b/plugin/metrics/metrics_test.go
index b0e311b73..1f63a20b7 100644
--- a/plugin/metrics/metrics_test.go
+++ b/plugin/metrics/metrics_test.go
@@ -31,25 +31,25 @@ func TestMetrics(t *testing.T) {
 		{
 			next:          test.NextHandler(dns.RcodeSuccess, nil),
 			qname:         "example.org",
-			metric:        "coredns_dns_request_count_total",
+			metric:        "coredns_dns_requests_total",
 			expectedValue: "1",
 		},
 		{
 			next:          test.NextHandler(dns.RcodeSuccess, nil),
 			qname:         "example.org",
-			metric:        "coredns_dns_request_count_total",
+			metric:        "coredns_dns_requests_total",
 			expectedValue: "2",
 		},
 		{
 			next:          test.NextHandler(dns.RcodeSuccess, nil),
 			qname:         "example.org",
-			metric:        "coredns_dns_request_type_count_total",
+			metric:        "coredns_dns_requests_total",
 			expectedValue: "3",
 		},
 		{
 			next:          test.NextHandler(dns.RcodeSuccess, nil),
 			qname:         "example.org",
-			metric:        "coredns_dns_response_rcode_count_total",
+			metric:        "coredns_dns_responses_total",
 			expectedValue: "4",
 		},
 	}
diff --git a/plugin/metrics/vars/report.go b/plugin/metrics/vars/report.go
index b0a9950ce..96b29d9fb 100644
--- a/plugin/metrics/vars/report.go
+++ b/plugin/metrics/vars/report.go
@@ -20,17 +20,16 @@ func Report(server string, req request.Request, zone, rcode string, size int, st
 	}
 
 	typ := req.QType()
-	RequestCount.WithLabelValues(server, zone, net, fam).Inc()
 
 	if req.Do() {
 		RequestDo.WithLabelValues(server, zone).Inc()
 	}
 
 	if _, known := monitorType[typ]; known {
-		RequestType.WithLabelValues(server, zone, dns.Type(typ).String()).Inc()
+		RequestCount.WithLabelValues(server, zone, net, fam, dns.Type(typ).String()).Inc()
 		RequestDuration.WithLabelValues(server, zone, dns.Type(typ).String()).Observe(time.Since(start).Seconds())
 	} else {
-		RequestType.WithLabelValues(server, zone, other).Inc()
+		RequestCount.WithLabelValues(server, zone, net, fam, other).Inc()
 		RequestDuration.WithLabelValues(server, zone, other).Observe(time.Since(start).Seconds())
 	}
 
diff --git a/plugin/metrics/vars/vars.go b/plugin/metrics/vars/vars.go
index 6d896e88a..16e20280a 100644
--- a/plugin/metrics/vars/vars.go
+++ b/plugin/metrics/vars/vars.go
@@ -11,9 +11,9 @@ var (
 	RequestCount = prometheus.NewCounterVec(prometheus.CounterOpts{
 		Namespace: plugin.Namespace,
 		Subsystem: subsystem,
-		Name:      "request_count_total",
+		Name:      "requests_total",
 		Help:      "Counter of DNS requests made per zone, protocol and family.",
-	}, []string{"server", "zone", "proto", "family"})
+	}, []string{"server", "zone", "proto", "family", "type"})
 
 	RequestDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
 		Namespace: plugin.Namespace,
@@ -34,17 +34,10 @@ var (
 	RequestDo = prometheus.NewCounterVec(prometheus.CounterOpts{
 		Namespace: plugin.Namespace,
 		Subsystem: subsystem,
-		Name:      "request_do_count_total",
+		Name:      "do_requests_total",
 		Help:      "Counter of DNS requests with DO bit set per zone.",
 	}, []string{"server", "zone"})
 
-	RequestType = prometheus.NewCounterVec(prometheus.CounterOpts{
-		Namespace: plugin.Namespace,
-		Subsystem: subsystem,
-		Name:      "request_type_count_total",
-		Help:      "Counter of DNS requests per type, per zone.",
-	}, []string{"server", "zone", "type"})
-
 	ResponseSize = prometheus.NewHistogramVec(prometheus.HistogramOpts{
 		Namespace: plugin.Namespace,
 		Subsystem: subsystem,
@@ -56,13 +49,13 @@ var (
 	ResponseRcode = prometheus.NewCounterVec(prometheus.CounterOpts{
 		Namespace: plugin.Namespace,
 		Subsystem: subsystem,
-		Name:      "response_rcode_count_total",
+		Name:      "responses_total",
 		Help:      "Counter of response status codes.",
 	}, []string{"server", "zone", "rcode"})
 
 	Panic = prometheus.NewCounter(prometheus.CounterOpts{
 		Namespace: plugin.Namespace,
-		Name:      "panic_count_total",
+		Name:      "panics_total",
 		Help:      "A metrics that counts the number of panics.",
 	})
 
diff --git a/plugin/reload/README.md b/plugin/reload/README.md
index 9809cdbaa..2417357b4 100644
--- a/plugin/reload/README.md
+++ b/plugin/reload/README.md
@@ -98,7 +98,7 @@ CoreDNS v1.7.0 and later does parse the Corefile and supports detecting changes
 
  If monitoring is enabled (via the *prometheus* plugin) then the following metric is exported:
 
-* `coredns_reload_failed_count_total{}` - counts the number of failed reload attempts.
+* `coredns_reload_failed_total{}` - counts the number of failed reload attempts.
 * `coredns_reload_version_info{hash, value}` - record the hash value during reload.
 
 Currently the type of `hash` is "md5", the `value` is the returned hash value.
diff --git a/plugin/reload/metrics.go b/plugin/reload/metrics.go
index de46862a4..fbe9f5e7a 100644
--- a/plugin/reload/metrics.go
+++ b/plugin/reload/metrics.go
@@ -11,7 +11,7 @@ var (
 	FailedCount = prometheus.NewCounter(prometheus.CounterOpts{
 		Namespace: plugin.Namespace,
 		Subsystem: "reload",
-		Name:      "failed_count_total",
+		Name:      "failed_total",
 		Help:      "Counter of the number of failed reload attempts.",
 	})
 
diff --git a/test/metrics_test.go b/test/metrics_test.go
index cad1a0a87..e79370873 100644
--- a/test/metrics_test.go
+++ b/test/metrics_test.go
@@ -36,7 +36,7 @@ example.com:0 {
 }
 
 func TestMetricsRefused(t *testing.T) {
-	metricName := "coredns_dns_response_rcode_count_total"
+	metricName := "coredns_dns_responses_total"
 
 	corefile := `example.org:0 {
 	forward . 8.8.8.8:53
@@ -110,7 +110,7 @@ func TestMetricsAuto(t *testing.T) {
 		t.Fatalf("Could not send message: %s", err)
 	}
 
-	metricName := "coredns_dns_request_count_total" //{zone, proto, family}
+	metricName := "coredns_dns_requests_total" // {zone, proto, family, type}
 
 	data := test.Scrape("http://" + metrics.ListenAddr + "/metrics")
 	// Get the value for the metrics where the one of the labels values matches "example.org."