Cleanup metrics (#3776)
Clean up a variety of metric issues. * Eliminate department of redundancy "count_total" naming. * Use the plural of the unit when appropriate. (e.g. "requests") * Remove label names from metric names where appropriate. (e.g. "rcode") * Simplify request metrics by consolidating the type label into the base request counter. * Re-generate man pages. Signed-off-by: Ben Kochie <superq@gmail.com> Co-authored-by: Ben Kochie <superq@gmail.com>
This commit is contained in:
parent
eb23cce1a7
commit
19cfa2960c
16 changed files with 39 additions and 50 deletions
|
@ -73,8 +73,8 @@ example.org {
|
|||
|
||||
If monitoring is enabled (via the _prometheus_ plugin) then the following metrics are exported:
|
||||
|
||||
- `coredns_request_block_count_total{server, zone}` - counter of DNS requests being blocked.
|
||||
- `coredns_dns_acl_blocked_requests_total{server, zone}` - counter of DNS requests being blocked.
|
||||
|
||||
- `coredns_request_allow_count_total{server}` - counter of DNS requests being allowed.
|
||||
- `coredns_dns_acl_allowed_requests_total{server}` - counter of DNS requests being allowed.
|
||||
|
||||
The `server` and `zone` labels are explained in the _metrics_ plugin documentation.
|
||||
|
|
|
@ -11,14 +11,14 @@ var (
|
|||
RequestBlockCount = prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||
Namespace: plugin.Namespace,
|
||||
Subsystem: "dns",
|
||||
Name: "request_block_count_total",
|
||||
Name: "acl_blocked_requests_total",
|
||||
Help: "Counter of DNS requests being blocked.",
|
||||
}, []string{"server", "zone"})
|
||||
// RequestAllowCount is the number of DNS requests being Allowed.
|
||||
RequestAllowCount = prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||
Namespace: plugin.Namespace,
|
||||
Subsystem: "dns",
|
||||
Name: "request_allow_count_total",
|
||||
Name: "acl_allowed_requests_total",
|
||||
Help: "Counter of DNS requests being allowed.",
|
||||
}, []string{"server"})
|
||||
)
|
||||
|
|
|
@ -31,7 +31,7 @@ If a plugin implements the `AutoPather` interface then it can be used.
|
|||
|
||||
If monitoring is enabled (via the *prometheus* plugin) then the following metric is exported:
|
||||
|
||||
* `coredns_autopath_success_count_total{server}` - counter of successfully autopath-ed queries.
|
||||
* `coredns_autopath_success_total{server}` - counter of successfully autopath-ed queries.
|
||||
|
||||
The `server` label is explained in the *metrics* plugin documentation.
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@ var (
|
|||
autoPathCount = prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||
Namespace: plugin.Namespace,
|
||||
Subsystem: "autopath",
|
||||
Name: "success_count_total",
|
||||
Name: "success_total",
|
||||
Help: "Counter of requests that did autopath.",
|
||||
}, []string{"server"})
|
||||
)
|
||||
|
|
|
@ -108,12 +108,12 @@ On each endpoint, the timeouts of the communication are set by default and autom
|
|||
If monitoring is enabled (via the *prometheus* plugin) then the following metrics are exported:
|
||||
|
||||
* `coredns_forward_request_duration_seconds{to}` - duration per upstream interaction.
|
||||
* `coredns_forward_request_count_total{to}` - query count per upstream.
|
||||
* `coredns_forward_response_rcode_count_total{to, rcode}` - count of RCODEs per upstream.
|
||||
* `coredns_forward_healthcheck_failure_count_total{to}` - number of failed health checks per upstream.
|
||||
* `coredns_forward_healthcheck_broken_count_total{}` - counter of when all upstreams are unhealthy,
|
||||
* `coredns_forward_requests_total{to}` - query count per upstream.
|
||||
* `coredns_forward_responses_total{to, rcode}` - count of RCODEs per upstream.
|
||||
* `coredns_forward_healthcheck_failures_total{to}` - number of failed health checks per upstream.
|
||||
* `coredns_forward_healthcheck_broken_total{}` - counter of when all upstreams are unhealthy,
|
||||
and we are randomly (this always uses the `random` policy) spraying to an upstream.
|
||||
* `max_concurrent_reject_count_total{}` - counter of the number of queries rejected because the
|
||||
* `coredns_forward_max_concurrent_rejects_total{}` - counter of the number of queries rejected because the
|
||||
number of concurrent queries were at maximum.
|
||||
Where `to` is one of the upstream servers (**TO** from the config), `rcode` is the returned RCODE
|
||||
from the upstream.
|
||||
|
|
|
@ -11,13 +11,13 @@ var (
|
|||
RequestCount = prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||
Namespace: plugin.Namespace,
|
||||
Subsystem: "forward",
|
||||
Name: "request_count_total",
|
||||
Name: "requests_total",
|
||||
Help: "Counter of requests made per upstream.",
|
||||
}, []string{"to"})
|
||||
RcodeCount = prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||
Namespace: plugin.Namespace,
|
||||
Subsystem: "forward",
|
||||
Name: "response_rcode_count_total",
|
||||
Name: "responses_total",
|
||||
Help: "Counter of requests made per upstream.",
|
||||
}, []string{"rcode", "to"})
|
||||
RequestDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
|
||||
|
@ -30,13 +30,13 @@ var (
|
|||
HealthcheckFailureCount = prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||
Namespace: plugin.Namespace,
|
||||
Subsystem: "forward",
|
||||
Name: "healthcheck_failure_count_total",
|
||||
Name: "healthcheck_failures_total",
|
||||
Help: "Counter of the number of failed healthchecks.",
|
||||
}, []string{"to"})
|
||||
HealthcheckBrokenCount = prometheus.NewCounter(prometheus.CounterOpts{
|
||||
Namespace: plugin.Namespace,
|
||||
Subsystem: "forward",
|
||||
Name: "healthcheck_broken_count_total",
|
||||
Name: "healthcheck_broken_total",
|
||||
Help: "Counter of the number of complete failures of the healthchecks.",
|
||||
})
|
||||
SocketGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||
|
@ -48,7 +48,7 @@ var (
|
|||
MaxConcurrentRejectCount = prometheus.NewCounter(prometheus.CounterOpts{
|
||||
Namespace: plugin.Namespace,
|
||||
Subsystem: "forward",
|
||||
Name: "max_concurrent_reject_count_total",
|
||||
Name: "max_concurrent_rejects_total",
|
||||
Help: "Counter of the number of queries rejected because the concurrent queries were at maximum.",
|
||||
})
|
||||
)
|
||||
|
|
|
@ -63,8 +63,8 @@ Also note the TLS config is "global" for the whole grpc proxy if you need a diff
|
|||
If monitoring is enabled (via the *prometheus* plugin) then the following metrics are exported:
|
||||
|
||||
* `coredns_grpc_request_duration_seconds{to}` - duration per upstream interaction.
|
||||
* `coredns_grpc_request_count_total{to}` - query count per upstream.
|
||||
* `coredns_grpc_response_rcode_count_total{to, rcode}` - count of RCODEs per upstream.
|
||||
* `coredns_grpc_requests_total{to}` - query count per upstream.
|
||||
* `coredns_grpc_responses_total{to, rcode}` - count of RCODEs per upstream.
|
||||
and we are randomly (this always uses the `random` policy) spraying to an upstream.
|
||||
|
||||
## Examples
|
||||
|
|
|
@ -11,13 +11,13 @@ var (
|
|||
RequestCount = prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||
Namespace: plugin.Namespace,
|
||||
Subsystem: "grpc",
|
||||
Name: "request_count_total",
|
||||
Name: "requests_total",
|
||||
Help: "Counter of requests made per upstream.",
|
||||
}, []string{"to"})
|
||||
RcodeCount = prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||
Namespace: plugin.Namespace,
|
||||
Subsystem: "grpc",
|
||||
Name: "response_rcode_count_total",
|
||||
Name: "responses_total",
|
||||
Help: "Counter of requests made per upstream.",
|
||||
}, []string{"rcode", "to"})
|
||||
RequestDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
|
||||
|
|
|
@ -11,14 +11,13 @@ The default location for the metrics is `localhost:9153`. The metrics path is fi
|
|||
The following metrics are exported:
|
||||
|
||||
* `coredns_build_info{version, revision, goversion}` - info about CoreDNS itself.
|
||||
* `coredns_panic_count_total{}` - total number of panics.
|
||||
* `coredns_dns_request_count_total{server, zone, proto, family}` - total query count.
|
||||
* `coredns_panics_total{}` - total number of panics.
|
||||
* `coredns_dns_requests_total{server, zone, proto, family, type}` - total query count.
|
||||
* `coredns_dns_request_duration_seconds{server, zone, type}` - duration to process each query.
|
||||
* `coredns_dns_request_size_bytes{server, zone, proto}` - size of the request in bytes.
|
||||
* `coredns_dns_request_do_count_total{server, zone}` - queries that have the DO bit set
|
||||
* `coredns_dns_request_type_count_total{server, zone, type}` - counter of queries per zone and type.
|
||||
* `coredns_dns_do_requests_total{server, zone}` - queries that have the DO bit set
|
||||
* `coredns_dns_response_size_bytes{server, zone, proto}` - response size in bytes.
|
||||
* `coredns_dns_response_rcode_count_total{server, zone, rcode}` - response per zone and rcode.
|
||||
* `coredns_dns_responses_total{server, zone, rcode}` - response per zone and rcode.
|
||||
* `coredns_plugin_enabled{server, zone, name}` - indicates whether a plugin is enabled on per server and zone basis.
|
||||
|
||||
Each counter has a label `zone` which is the zonename used for the request/response.
|
||||
|
@ -33,7 +32,6 @@ Extra labels used are:
|
|||
* `type` which holds the query type. It holds most common types (A, AAAA, MX, SOA, CNAME, PTR, TXT,
|
||||
NS, SRV, DS, DNSKEY, RRSIG, NSEC, NSEC3, IXFR, AXFR and ANY) and "other" which lumps together all
|
||||
other types.
|
||||
* The `response_rcode_count_total` has an extra label `rcode` which holds the rcode of the response.
|
||||
|
||||
If monitoring is enabled, queries that do not enter the plugin chain are exported under the fake
|
||||
name "dropped" (without a closing dot - this is never a valid domain name).
|
||||
|
|
|
@ -51,7 +51,6 @@ func New(addr string) *Metrics {
|
|||
met.MustRegister(vars.RequestDuration)
|
||||
met.MustRegister(vars.RequestSize)
|
||||
met.MustRegister(vars.RequestDo)
|
||||
met.MustRegister(vars.RequestType)
|
||||
met.MustRegister(vars.ResponseSize)
|
||||
met.MustRegister(vars.ResponseRcode)
|
||||
met.MustRegister(vars.PluginEnabled)
|
||||
|
|
|
@ -31,25 +31,25 @@ func TestMetrics(t *testing.T) {
|
|||
{
|
||||
next: test.NextHandler(dns.RcodeSuccess, nil),
|
||||
qname: "example.org",
|
||||
metric: "coredns_dns_request_count_total",
|
||||
metric: "coredns_dns_requests_total",
|
||||
expectedValue: "1",
|
||||
},
|
||||
{
|
||||
next: test.NextHandler(dns.RcodeSuccess, nil),
|
||||
qname: "example.org",
|
||||
metric: "coredns_dns_request_count_total",
|
||||
metric: "coredns_dns_requests_total",
|
||||
expectedValue: "2",
|
||||
},
|
||||
{
|
||||
next: test.NextHandler(dns.RcodeSuccess, nil),
|
||||
qname: "example.org",
|
||||
metric: "coredns_dns_request_type_count_total",
|
||||
metric: "coredns_dns_requests_total",
|
||||
expectedValue: "3",
|
||||
},
|
||||
{
|
||||
next: test.NextHandler(dns.RcodeSuccess, nil),
|
||||
qname: "example.org",
|
||||
metric: "coredns_dns_response_rcode_count_total",
|
||||
metric: "coredns_dns_responses_total",
|
||||
expectedValue: "4",
|
||||
},
|
||||
}
|
||||
|
|
|
@ -20,17 +20,16 @@ func Report(server string, req request.Request, zone, rcode string, size int, st
|
|||
}
|
||||
|
||||
typ := req.QType()
|
||||
RequestCount.WithLabelValues(server, zone, net, fam).Inc()
|
||||
|
||||
if req.Do() {
|
||||
RequestDo.WithLabelValues(server, zone).Inc()
|
||||
}
|
||||
|
||||
if _, known := monitorType[typ]; known {
|
||||
RequestType.WithLabelValues(server, zone, dns.Type(typ).String()).Inc()
|
||||
RequestCount.WithLabelValues(server, zone, net, fam, dns.Type(typ).String()).Inc()
|
||||
RequestDuration.WithLabelValues(server, zone, dns.Type(typ).String()).Observe(time.Since(start).Seconds())
|
||||
} else {
|
||||
RequestType.WithLabelValues(server, zone, other).Inc()
|
||||
RequestCount.WithLabelValues(server, zone, net, fam, other).Inc()
|
||||
RequestDuration.WithLabelValues(server, zone, other).Observe(time.Since(start).Seconds())
|
||||
}
|
||||
|
||||
|
|
|
@ -11,9 +11,9 @@ var (
|
|||
RequestCount = prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||
Namespace: plugin.Namespace,
|
||||
Subsystem: subsystem,
|
||||
Name: "request_count_total",
|
||||
Name: "requests_total",
|
||||
Help: "Counter of DNS requests made per zone, protocol and family.",
|
||||
}, []string{"server", "zone", "proto", "family"})
|
||||
}, []string{"server", "zone", "proto", "family", "type"})
|
||||
|
||||
RequestDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
|
||||
Namespace: plugin.Namespace,
|
||||
|
@ -34,17 +34,10 @@ var (
|
|||
RequestDo = prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||
Namespace: plugin.Namespace,
|
||||
Subsystem: subsystem,
|
||||
Name: "request_do_count_total",
|
||||
Name: "do_requests_total",
|
||||
Help: "Counter of DNS requests with DO bit set per zone.",
|
||||
}, []string{"server", "zone"})
|
||||
|
||||
RequestType = prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||
Namespace: plugin.Namespace,
|
||||
Subsystem: subsystem,
|
||||
Name: "request_type_count_total",
|
||||
Help: "Counter of DNS requests per type, per zone.",
|
||||
}, []string{"server", "zone", "type"})
|
||||
|
||||
ResponseSize = prometheus.NewHistogramVec(prometheus.HistogramOpts{
|
||||
Namespace: plugin.Namespace,
|
||||
Subsystem: subsystem,
|
||||
|
@ -56,13 +49,13 @@ var (
|
|||
ResponseRcode = prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||
Namespace: plugin.Namespace,
|
||||
Subsystem: subsystem,
|
||||
Name: "response_rcode_count_total",
|
||||
Name: "responses_total",
|
||||
Help: "Counter of response status codes.",
|
||||
}, []string{"server", "zone", "rcode"})
|
||||
|
||||
Panic = prometheus.NewCounter(prometheus.CounterOpts{
|
||||
Namespace: plugin.Namespace,
|
||||
Name: "panic_count_total",
|
||||
Name: "panics_total",
|
||||
Help: "A metrics that counts the number of panics.",
|
||||
})
|
||||
|
||||
|
|
|
@ -98,7 +98,7 @@ CoreDNS v1.7.0 and later does parse the Corefile and supports detecting changes
|
|||
|
||||
If monitoring is enabled (via the *prometheus* plugin) then the following metrics are exported:
|
||||
|
||||
* `coredns_reload_failed_count_total{}` - counts the number of failed reload attempts.
|
||||
* `coredns_reload_failed_total{}` - counts the number of failed reload attempts.
|
||||
* `coredns_reload_version_info{hash, value}` - record the hash value during reload.
|
||||
|
||||
Currently the type of `hash` is "md5", the `value` is the returned hash value.
|
||||
|
|
|
@ -11,7 +11,7 @@ var (
|
|||
FailedCount = prometheus.NewCounter(prometheus.CounterOpts{
|
||||
Namespace: plugin.Namespace,
|
||||
Subsystem: "reload",
|
||||
Name: "failed_count_total",
|
||||
Name: "failed_total",
|
||||
Help: "Counter of the number of failed reload attempts.",
|
||||
})
|
||||
|
||||
|
|
|
@ -36,7 +36,7 @@ example.com:0 {
|
|||
}
|
||||
|
||||
func TestMetricsRefused(t *testing.T) {
|
||||
metricName := "coredns_dns_response_rcode_count_total"
|
||||
metricName := "coredns_dns_responses_total"
|
||||
|
||||
corefile := `example.org:0 {
|
||||
forward . 8.8.8.8:53
|
||||
|
@ -110,7 +110,7 @@ func TestMetricsAuto(t *testing.T) {
|
|||
t.Fatalf("Could not send message: %s", err)
|
||||
}
|
||||
|
||||
metricName := "coredns_dns_request_count_total" //{zone, proto, family}
|
||||
metricName := "coredns_dns_requests_total" // {zone, proto, family, type}
|
||||
|
||||
data := test.Scrape("http://" + metrics.ListenAddr + "/metrics")
|
||||
// Get the value for the metrics where the one of the labels values matches "example.org."
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue