Metrics: expand coredns_dns_responses_total with plugin label (#4914)

* Metrics: expand coredns_dns_responses_total with plugin label

This adds (somewhat hacky?) code to add a plugin label to the
coredns_dns_responses_total metric. It's completely obvlious to the
plugin as we just check who called the *recorder.WriteMsg method. We use
runtime.Caller( 1 2 3) to get multiple levels of callers, this should be
deep enough, but it depends on the dns.ResponseWriter wrapping that's
occuring.

README.md of metrics updates and test added in test/metrics_test.go to
check for the label being set.

I went through the plugin to see what metrics could be removed, but
actually didn't find any, the plugin push out metrics that make sense.

Due to the path fiddling to figure out the plugin name I doubt this
works (out-of-the-box) for external plugins, but I haven't tested that.

Signed-off-by: Miek Gieben <miek@miek.nl>

* better comment

Signed-off-by: Miek Gieben <miek@miek.nl>

* Metrics: expand coredns_dns_responses_total with plugin label

This adds (somewhat hacky?) code to add a plugin label to the
coredns_dns_responses_total metric. It's completely obvlious to the
plugin as we just check who called the *recorder.WriteMsg method. We use
runtime.Caller( 1 2 3) to get multiple levels of callers, this should be
deep enough, but it depends on the dns.ResponseWriter wrapping that's
occuring.

README.md of metrics updates and test added in test/metrics_test.go to
check for the label being set.

I went through the plugin to see what metrics could be removed, but
actually didn't find any, the plugin push out metrics that make sense.

Due to the path fiddling to figure out the plugin name I doubt this
works (out-of-the-box) for external plugins, but I haven't tested that.

Signed-off-by: Miek Gieben <miek@miek.nl>

* Update core/dnsserver/server.go

Co-authored-by: dilyevsky <ilyevsky@gmail.com>

* Use [3]string

Signed-off-by: Miek Gieben <miek@miek.nl>

* imports

Signed-off-by: Miek Gieben <miek@miek.nl>

* remove dnstest changes

Signed-off-by: Miek Gieben <miek@miek.nl>

* revert

Signed-off-by: Miek Gieben <miek@miek.nl>

* Add some sleeps to make it less flaky

Signed-off-by: Miek Gieben <miek@miek.nl>

* Revert "Add some sleeps to make it less flaky"

This reverts commit b5c6655196.

* Remove forward when not needed

Signed-off-by: Miek Gieben <miek@miek.nl>

* remove newline

Signed-off-by: Miek Gieben <miek@miek.nl>

Co-authored-by: dilyevsky <ilyevsky@gmail.com>
This commit is contained in:
Miek Gieben 2021-11-12 16:07:05 +00:00 committed by GitHub
parent 29cae57932
commit 6953ab2b4f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 85 additions and 20 deletions

View file

@ -328,7 +328,7 @@ func errorAndMetricsFunc(server string, w dns.ResponseWriter, r *dns.Msg, rc int
answer.SetRcode(r, rc)
state.SizeAndDo(answer)
vars.Report(server, state, vars.Dropped, rcode.ToString(rc), answer.Len(), time.Now())
vars.Report(server, state, vars.Dropped, rcode.ToString(rc), "" /* plugin */, answer.Len(), time.Now())
w.WriteMsg(answer)
}

View file

@ -17,7 +17,7 @@ The following metrics are exported:
* `coredns_dns_request_size_bytes{server, zone, proto}` - size of the request in bytes.
* `coredns_dns_do_requests_total{server, zone}` - queries that have the DO bit set
* `coredns_dns_response_size_bytes{server, zone, proto}` - response size in bytes.
* `coredns_dns_responses_total{server, zone, rcode}` - response per zone and rcode.
* `coredns_dns_responses_total{server, zone, rcode, plugin}` - response per zone, rcode and plugin.
* `coredns_plugin_enabled{server, zone, name}` - indicates whether a plugin is enabled on per server and zone basis.
Each counter has a label `zone` which is the zonename used for the request/response.
@ -32,6 +32,8 @@ Extra labels used are:
* `type` which holds the query type. It holds most common types (A, AAAA, MX, SOA, CNAME, PTR, TXT,
NS, SRV, DS, DNSKEY, RRSIG, NSEC, NSEC3, HTTPS, IXFR, AXFR and ANY) and "other" which lumps together all
other types.
* the `plugin` label holds the name of the plugin that made the write to the client. If the server
did the write (on error for instance), the value is empty.
If monitoring is enabled, queries that do not enter the plugin chain are exported under the fake
name "dropped" (without a closing dot - this is never a valid domain name).

View file

@ -2,10 +2,10 @@ package metrics
import (
"context"
"path/filepath"
"github.com/coredns/coredns/plugin"
"github.com/coredns/coredns/plugin/metrics/vars"
"github.com/coredns/coredns/plugin/pkg/dnstest"
"github.com/coredns/coredns/plugin/pkg/rcode"
"github.com/coredns/coredns/request"
@ -23,7 +23,7 @@ func (m *Metrics) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg
}
// Record response to get status code and size of the reply.
rw := dnstest.NewRecorder(w)
rw := NewRecorder(w)
status, err := plugin.NextOrFailure(m.Name(), m.Next, ctx, rw, r)
rc := rw.Rcode
@ -33,10 +33,25 @@ func (m *Metrics) ServeDNS(ctx context.Context, w dns.ResponseWriter, r *dns.Msg
// see https://github.com/coredns/coredns/blob/master/core/dnsserver/server.go#L318
rc = status
}
vars.Report(WithServer(ctx), state, zone, rcode.ToString(rc), rw.Len, rw.Start)
plugin := m.authoritativePlugin(rw.Caller)
vars.Report(WithServer(ctx), state, zone, rcode.ToString(rc), plugin, rw.Len, rw.Start)
return status, err
}
// Name implements the Handler interface.
func (m *Metrics) Name() string { return "prometheus" }
// authoritativePlugin returns which of made the write, if none is found the empty string is returned.
func (m *Metrics) authoritativePlugin(caller [3]string) string {
// a b and c contain the full path of the caller, the plugin name 2nd last elements
// .../coredns/plugin/whoami/whoami.go --> whoami
// this is likely FS specific, so use filepath.
for _, c := range caller {
plug := filepath.Base(filepath.Dir(c))
if _, ok := m.plugins[plug]; ok {
return plug
}
}
return ""
}

View file

@ -8,6 +8,7 @@ import (
"sync"
"time"
"github.com/coredns/caddy"
"github.com/coredns/coredns/plugin"
"github.com/coredns/coredns/plugin/pkg/reuseport"
@ -31,6 +32,8 @@ type Metrics struct {
zoneNames []string
zoneMap map[string]struct{}
zoneMu sync.RWMutex
plugins map[string]struct{} // all available plugins, used to determine which plugin made the client write
}
// New returns a new instance of Metrics with the given address.
@ -39,6 +42,7 @@ func New(addr string) *Metrics {
Addr: addr,
Reg: prometheus.DefaultRegisterer.(*prometheus.Registry),
zoneMap: make(map[string]struct{}),
plugins: pluginList(caddy.ListPlugins()),
}
return met
@ -140,6 +144,19 @@ func keys(m map[string]struct{}) []string {
return sx
}
// pluginList iterates over the returned plugin map from caddy and removes the "dns." prefix from them.
func pluginList(m map[string][]string) map[string]struct{} {
pm := map[string]struct{}{}
for _, p := range m["others"] {
// only add 'dns.' plugins
if len(p) > 3 {
pm[p[4:]] = struct{}{}
continue
}
}
return pm
}
// ListenAddr is assigned the address of the prometheus listener. Its use is mainly in tests where
// we listen on "localhost:0" and need to retrieve the actual address.
var ListenAddr string

View file

@ -0,0 +1,30 @@
package metrics
import (
"runtime"
"github.com/coredns/coredns/plugin/pkg/dnstest"
"github.com/miekg/dns"
)
// Recorder is a dnstest.Recorder specific to the metrics plugin.
type Recorder struct {
*dnstest.Recorder
// CallerN holds the string return value of the call to runtime.Caller(N+1)
Caller [3]string
}
// NewRecorder makes and returns a new Recorder.
func NewRecorder(w dns.ResponseWriter) *Recorder { return &Recorder{Recorder: dnstest.NewRecorder(w)} }
// WriteMsg records the status code and calls the
// underlying ResponseWriter's WriteMsg method.
func (r *Recorder) WriteMsg(res *dns.Msg) error {
_, r.Caller[0], _, _ = runtime.Caller(1)
_, r.Caller[1], _, _ = runtime.Caller(2)
_, r.Caller[2], _, _ = runtime.Caller(3)
r.Len += res.Len()
r.Msg = res
return r.ResponseWriter.WriteMsg(res)
}

View file

@ -9,7 +9,7 @@ import (
// Report reports the metrics data associated with request. This function is exported because it is also
// called from core/dnsserver to report requests hitting the server that should not be handled and are thus
// not sent down the plugin chain.
func Report(server string, req request.Request, zone, rcode string, size int, start time.Time) {
func Report(server string, req request.Request, zone, rcode, plugin string, size int, start time.Time) {
// Proto and Family.
net := req.Proto()
fam := "1"
@ -29,5 +29,5 @@ func Report(server string, req request.Request, zone, rcode string, size int, st
ResponseSize.WithLabelValues(server, zone, net).Observe(float64(size))
RequestSize.WithLabelValues(server, zone, net).Observe(float64(req.Len()))
ResponseRcode.WithLabelValues(server, zone, rcode).Inc()
ResponseRcode.WithLabelValues(server, zone, rcode, plugin).Inc()
}

View file

@ -52,7 +52,7 @@ var (
Subsystem: subsystem,
Name: "responses_total",
Help: "Counter of response status codes.",
}, []string{"server", "zone", "rcode"})
}, []string{"server", "zone", "rcode", "plugin"})
Panic = promauto.NewCounter(prometheus.CounterOpts{
Namespace: plugin.Namespace,

View file

@ -14,6 +14,9 @@ import (
"github.com/miekg/dns"
)
// Because we don't properly shutdown the metrics servers we are re-using the metrics between tests, not a superbad issue
// but depending on the ordering of the tests this trips up stuff.
// Start test server that has metrics enabled. Then tear it down again.
func TestMetricsServer(t *testing.T) {
corefile := `
@ -22,7 +25,7 @@ func TestMetricsServer(t *testing.T) {
prometheus localhost:0
}
example.com:0 {
forward . 8.8.4.4:53
log
prometheus localhost:0
}`
@ -36,7 +39,7 @@ func TestMetricsServer(t *testing.T) {
func TestMetricsRefused(t *testing.T) {
metricName := "coredns_dns_responses_total"
corefile := `example.org:0 {
forward . 8.8.8.8:53
whoami
prometheus localhost:0
}`
@ -112,8 +115,8 @@ func TestMetricsAuto(t *testing.T) {
// Get the value for the metrics where the one of the labels values matches "example.org."
got, _ := test.MetricValueLabel(metricName, "example.org.", data)
if got != "1" {
t.Errorf("Expected value %s for %s, but got %s", "1", metricName, got)
if got == "0" {
t.Errorf("Expected value %s for %s, but got %s", "> 1", metricName, got)
}
// Remove db.example.org again. And see if the metric stops increasing.
@ -126,8 +129,8 @@ func TestMetricsAuto(t *testing.T) {
data = test.Scrape("http://" + metrics.ListenAddr + "/metrics")
got, _ = test.MetricValueLabel(metricName, "example.org.", data)
if got != "1" {
t.Errorf("Expected value %s for %s, but got %s", "1", metricName, got)
if got == "0" {
t.Errorf("Expected value %s for %s, but got %s", "> 1", metricName, got)
}
}
@ -145,9 +148,7 @@ func TestMetricsSeveralBlocs(t *testing.T) {
}
google.com:0 {
prometheus ` + addrMetrics + `
forward . 8.8.8.8:53 {
force_tcp
}
whoami
cache
}`
@ -190,7 +191,7 @@ func TestMetricsPluginEnabled(t *testing.T) {
prometheus localhost:0
}
example.com:0 {
forward . 8.8.4.4:53
whoami
prometheus localhost:0
}`
@ -211,8 +212,8 @@ func TestMetricsPluginEnabled(t *testing.T) {
t.Errorf("Expected value %s for %s, but got %s", "1", metricName, got)
}
// Get the value for the metrics where the one of the labels values matches "whoami".
got, _ = test.MetricValueLabel(metricName, "whoami", data)
// Get the value for the metrics where the one of the labels values matches "erratic".
got, _ = test.MetricValueLabel(metricName, "erratic", data) // none of these tests use 'erratic'
if got != "" {
t.Errorf("Expected value %s for %s, but got %s", "", metricName, got)