Cleanup metrics (#3776)

Cleanup a variety of metric issues.
* Eliminate department of redundancy "count_total" naming.
* Use the plural of the unit when appropriate. (ex, "requests")
* Remove label names from metric names where appropriate. (ex, "rcode")
* Simplify request metrics by consolidating the type label into the base
request counter.
* Re-generate man pages.

Signed-off-by: Ben Kochie <superq@gmail.com>

Co-authored-by: Ben Kochie <superq@gmail.com>
This commit is contained in:
Miek Gieben 2020-03-26 09:17:33 +01:00 committed by GitHub
parent eb23cce1a7
commit 19cfa2960c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
16 changed files with 39 additions and 50 deletions

View file

@ -73,8 +73,8 @@ example.org {
If monitoring is enabled (via the _prometheus_ plugin) then the following metrics are exported:
- `coredns_request_block_count_total{server, zone}` - counter of DNS requests being blocked.
- `coredns_dns_acl_blocked_requests_total{server, zone}` - counter of DNS requests being blocked.
- `coredns_request_allow_count_total{server}` - counter of DNS requests being allowed.
- `coredns_dns_acl_allowed_requests_total{server}` - counter of DNS requests being allowed.
The `server` and `zone` labels are explained in the _metrics_ plugin documentation.

View file

@ -11,14 +11,14 @@ var (
RequestBlockCount = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: plugin.Namespace,
Subsystem: "dns",
Name: "request_block_count_total",
Name: "acl_blocked_requests_total",
Help: "Counter of DNS requests being blocked.",
}, []string{"server", "zone"})
// RequestAllowCount is the number of DNS requests being Allowed.
RequestAllowCount = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: plugin.Namespace,
Subsystem: "dns",
Name: "request_allow_count_total",
Name: "acl_allowed_requests_total",
Help: "Counter of DNS requests being allowed.",
}, []string{"server"})
)

View file

@ -31,7 +31,7 @@ If a plugin implements the `AutoPather` interface then it can be used.
If monitoring is enabled (via the *prometheus* plugin) then the following metric is exported:
* `coredns_autopath_success_count_total{server}` - counter of successfully autopath-ed queries.
* `coredns_autopath_success_total{server}` - counter of successfully autopath-ed queries.
The `server` label is explained in the *metrics* plugin documentation.

View file

@ -10,7 +10,7 @@ var (
autoPathCount = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: plugin.Namespace,
Subsystem: "autopath",
Name: "success_count_total",
Name: "success_total",
Help: "Counter of requests that did autopath.",
}, []string{"server"})
)

View file

@ -108,12 +108,12 @@ On each endpoint, the timeouts of the communication are set by default and autom
If monitoring is enabled (via the *prometheus* plugin) then the following metrics are exported:
* `coredns_forward_request_duration_seconds{to}` - duration per upstream interaction.
* `coredns_forward_request_count_total{to}` - query count per upstream.
* `coredns_forward_response_rcode_count_total{to, rcode}` - count of RCODEs per upstream.
* `coredns_forward_healthcheck_failure_count_total{to}` - number of failed health checks per upstream.
* `coredns_forward_healthcheck_broken_count_total{}` - counter of when all upstreams are unhealthy,
* `coredns_forward_requests_total{to}` - query count per upstream.
* `coredns_forward_responses_total{to, rcode}` - count of RCODEs per upstream.
* `coredns_forward_healthcheck_failures_total{to}` - number of failed health checks per upstream.
* `coredns_forward_healthcheck_broken_total{}` - counter of when all upstreams are unhealthy,
and we are randomly (this always uses the `random` policy) spraying to an upstream.
* `max_concurrent_reject_count_total{}` - counter of the number of queries rejected because the
* `coredns_forward_max_concurrent_rejects_total{}` - counter of the number of queries rejected because the
number of concurrent queries was at maximum.
Where `to` is one of the upstream servers (**TO** from the config), `rcode` is the returned RCODE
from the upstream.

View file

@ -11,13 +11,13 @@ var (
RequestCount = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: plugin.Namespace,
Subsystem: "forward",
Name: "request_count_total",
Name: "requests_total",
Help: "Counter of requests made per upstream.",
}, []string{"to"})
RcodeCount = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: plugin.Namespace,
Subsystem: "forward",
Name: "response_rcode_count_total",
Name: "responses_total",
Help: "Counter of requests made per upstream.",
}, []string{"rcode", "to"})
RequestDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
@ -30,13 +30,13 @@ var (
HealthcheckFailureCount = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: plugin.Namespace,
Subsystem: "forward",
Name: "healthcheck_failure_count_total",
Name: "healthcheck_failures_total",
Help: "Counter of the number of failed healthchecks.",
}, []string{"to"})
HealthcheckBrokenCount = prometheus.NewCounter(prometheus.CounterOpts{
Namespace: plugin.Namespace,
Subsystem: "forward",
Name: "healthcheck_broken_count_total",
Name: "healthcheck_broken_total",
Help: "Counter of the number of complete failures of the healthchecks.",
})
SocketGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
@ -48,7 +48,7 @@ var (
MaxConcurrentRejectCount = prometheus.NewCounter(prometheus.CounterOpts{
Namespace: plugin.Namespace,
Subsystem: "forward",
Name: "max_concurrent_reject_count_total",
Name: "max_concurrent_rejects_total",
Help: "Counter of the number of queries rejected because the concurrent queries were at maximum.",
})
)

View file

@ -63,8 +63,8 @@ Also note the TLS config is "global" for the whole grpc proxy if you need a diff
If monitoring is enabled (via the *prometheus* plugin) then the following metrics are exported:
* `coredns_grpc_request_duration_seconds{to}` - duration per upstream interaction.
* `coredns_grpc_request_count_total{to}` - query count per upstream.
* `coredns_grpc_response_rcode_count_total{to, rcode}` - count of RCODEs per upstream.
* `coredns_grpc_requests_total{to}` - query count per upstream.
* `coredns_grpc_responses_total{to, rcode}` - count of RCODEs per upstream.
and we are randomly (this always uses the `random` policy) spraying to an upstream.
## Examples

View file

@ -11,13 +11,13 @@ var (
RequestCount = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: plugin.Namespace,
Subsystem: "grpc",
Name: "request_count_total",
Name: "requests_total",
Help: "Counter of requests made per upstream.",
}, []string{"to"})
RcodeCount = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: plugin.Namespace,
Subsystem: "grpc",
Name: "response_rcode_count_total",
Name: "responses_total",
Help: "Counter of requests made per upstream.",
}, []string{"rcode", "to"})
RequestDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{

View file

@ -11,14 +11,13 @@ The default location for the metrics is `localhost:9153`. The metrics path is fi
The following metrics are exported:
* `coredns_build_info{version, revision, goversion}` - info about CoreDNS itself.
* `coredns_panic_count_total{}` - total number of panics.
* `coredns_dns_request_count_total{server, zone, proto, family}` - total query count.
* `coredns_panics_total{}` - total number of panics.
* `coredns_dns_requests_total{server, zone, proto, family, type}` - total query count.
* `coredns_dns_request_duration_seconds{server, zone, type}` - duration to process each query.
* `coredns_dns_request_size_bytes{server, zone, proto}` - size of the request in bytes.
* `coredns_dns_request_do_count_total{server, zone}` - queries that have the DO bit set
* `coredns_dns_request_type_count_total{server, zone, type}` - counter of queries per zone and type.
* `coredns_dns_do_requests_total{server, zone}` - queries that have the DO bit set
* `coredns_dns_response_size_bytes{server, zone, proto}` - response size in bytes.
* `coredns_dns_response_rcode_count_total{server, zone, rcode}` - response per zone and rcode.
* `coredns_dns_responses_total{server, zone, rcode}` - response per zone and rcode.
* `coredns_plugin_enabled{server, zone, name}` - indicates whether a plugin is enabled on per server and zone basis.
Each counter has a label `zone` which is the zonename used for the request/response.
@ -33,7 +32,6 @@ Extra labels used are:
* `type` which holds the query type. It holds most common types (A, AAAA, MX, SOA, CNAME, PTR, TXT,
NS, SRV, DS, DNSKEY, RRSIG, NSEC, NSEC3, IXFR, AXFR and ANY) and "other" which lumps together all
other types.
* The `response_rcode_count_total` has an extra label `rcode` which holds the rcode of the response.
If monitoring is enabled, queries that do not enter the plugin chain are exported under the fake
name "dropped" (without a closing dot - this is never a valid domain name).

View file

@ -51,7 +51,6 @@ func New(addr string) *Metrics {
met.MustRegister(vars.RequestDuration)
met.MustRegister(vars.RequestSize)
met.MustRegister(vars.RequestDo)
met.MustRegister(vars.RequestType)
met.MustRegister(vars.ResponseSize)
met.MustRegister(vars.ResponseRcode)
met.MustRegister(vars.PluginEnabled)

View file

@ -31,25 +31,25 @@ func TestMetrics(t *testing.T) {
{
next: test.NextHandler(dns.RcodeSuccess, nil),
qname: "example.org",
metric: "coredns_dns_request_count_total",
metric: "coredns_dns_requests_total",
expectedValue: "1",
},
{
next: test.NextHandler(dns.RcodeSuccess, nil),
qname: "example.org",
metric: "coredns_dns_request_count_total",
metric: "coredns_dns_requests_total",
expectedValue: "2",
},
{
next: test.NextHandler(dns.RcodeSuccess, nil),
qname: "example.org",
metric: "coredns_dns_request_type_count_total",
metric: "coredns_dns_requests_total",
expectedValue: "3",
},
{
next: test.NextHandler(dns.RcodeSuccess, nil),
qname: "example.org",
metric: "coredns_dns_response_rcode_count_total",
metric: "coredns_dns_responses_total",
expectedValue: "4",
},
}

View file

@ -20,17 +20,16 @@ func Report(server string, req request.Request, zone, rcode string, size int, st
}
typ := req.QType()
RequestCount.WithLabelValues(server, zone, net, fam).Inc()
if req.Do() {
RequestDo.WithLabelValues(server, zone).Inc()
}
if _, known := monitorType[typ]; known {
RequestType.WithLabelValues(server, zone, dns.Type(typ).String()).Inc()
RequestCount.WithLabelValues(server, zone, net, fam, dns.Type(typ).String()).Inc()
RequestDuration.WithLabelValues(server, zone, dns.Type(typ).String()).Observe(time.Since(start).Seconds())
} else {
RequestType.WithLabelValues(server, zone, other).Inc()
RequestCount.WithLabelValues(server, zone, net, fam, other).Inc()
RequestDuration.WithLabelValues(server, zone, other).Observe(time.Since(start).Seconds())
}

View file

@ -11,9 +11,9 @@ var (
RequestCount = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: plugin.Namespace,
Subsystem: subsystem,
Name: "request_count_total",
Name: "requests_total",
Help: "Counter of DNS requests made per zone, protocol and family.",
}, []string{"server", "zone", "proto", "family"})
}, []string{"server", "zone", "proto", "family", "type"})
RequestDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Namespace: plugin.Namespace,
@ -34,17 +34,10 @@ var (
RequestDo = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: plugin.Namespace,
Subsystem: subsystem,
Name: "request_do_count_total",
Name: "do_requests_total",
Help: "Counter of DNS requests with DO bit set per zone.",
}, []string{"server", "zone"})
RequestType = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: plugin.Namespace,
Subsystem: subsystem,
Name: "request_type_count_total",
Help: "Counter of DNS requests per type, per zone.",
}, []string{"server", "zone", "type"})
ResponseSize = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Namespace: plugin.Namespace,
Subsystem: subsystem,
@ -56,13 +49,13 @@ var (
ResponseRcode = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: plugin.Namespace,
Subsystem: subsystem,
Name: "response_rcode_count_total",
Name: "responses_total",
Help: "Counter of response status codes.",
}, []string{"server", "zone", "rcode"})
Panic = prometheus.NewCounter(prometheus.CounterOpts{
Namespace: plugin.Namespace,
Name: "panic_count_total",
Name: "panics_total",
Help: "A metrics that counts the number of panics.",
})

View file

@ -98,7 +98,7 @@ CoreDNS v1.7.0 and later does parse the Corefile and supports detecting changes
If monitoring is enabled (via the *prometheus* plugin) then the following metrics are exported:
* `coredns_reload_failed_count_total{}` - counts the number of failed reload attempts.
* `coredns_reload_failed_total{}` - counts the number of failed reload attempts.
* `coredns_reload_version_info{hash, value}` - record the hash value during reload.
Currently the type of `hash` is "md5", the `value` is the returned hash value.

View file

@ -11,7 +11,7 @@ var (
FailedCount = prometheus.NewCounter(prometheus.CounterOpts{
Namespace: plugin.Namespace,
Subsystem: "reload",
Name: "failed_count_total",
Name: "failed_total",
Help: "Counter of the number of failed reload attempts.",
})

View file

@ -36,7 +36,7 @@ example.com:0 {
}
func TestMetricsRefused(t *testing.T) {
metricName := "coredns_dns_response_rcode_count_total"
metricName := "coredns_dns_responses_total"
corefile := `example.org:0 {
forward . 8.8.8.8:53
@ -110,7 +110,7 @@ func TestMetricsAuto(t *testing.T) {
t.Fatalf("Could not send message: %s", err)
}
metricName := "coredns_dns_request_count_total" //{zone, proto, family}
metricName := "coredns_dns_requests_total" // {zone, proto, family, type}
data := test.Scrape("http://" + metrics.ListenAddr + "/metrics")
// Get the value for the metrics where the one of the labels values matches "example.org."