coredns/plugin/health/overloaded.go
Ben Kochie 9edfaed631
Reduce the cardinality of health endpoint metrics (#4650)
The health endpoint histogram has a large amount of cardinality for a
simple endpoint. Introduce a new "Slim" set of buckets for `/health` to
reduce the metrics load on large deployments, especially those that have
per-node DNS caching services.

Add a metric to count internal health check failures rather than using the
timeout value as a side-effect monitor of the check error. This avoids
incorrectly recording the timeout value if there is an error that is not
a timeout (e.g. connection refused).

Signed-off-by: SuperQ <superq@gmail.com>
2021-05-27 15:16:38 +02:00

63 lines
1.8 KiB
Go

package health
import (
"net/http"
"time"
"github.com/coredns/coredns/plugin"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
)
// overloaded probes the local /health endpoint once per second until h.stop
// fires. Every probe's duration is recorded in HealthDuration; probes whose
// request returns an error are additionally counted in HealthFailures.
func (h *health) overloaded() {
	const (
		probeTimeout  = 3 * time.Second // upper bound for a single probe request
		probeInterval = 1 * time.Second // pause between two probes
	)

	endpoint := "http://" + h.Addr + "/health"
	probe := http.Client{Timeout: probeTimeout}

	ticker := time.NewTicker(probeInterval)
	defer ticker.Stop()

	for {
		// Block until either the next tick arrives or we are told to stop.
		select {
		case <-h.stop:
			return
		case <-ticker.C:
		}

		began := time.Now()
		resp, err := probe.Get(endpoint)
		if err != nil {
			// Record how long the failed attempt took and count it separately,
			// so failures are visible without relying on the timeout value.
			HealthDuration.Observe(time.Since(began).Seconds())
			HealthFailures.Inc()
			log.Warningf("Local health request to %q failed: %s", endpoint, err)
			continue
		}

		// NOTE(review): the body is closed without being read. Per the net/http
		// documentation the transport may then be unable to reuse the
		// connection, so each probe may dial anew — confirm this is intended.
		resp.Body.Close()

		took := time.Since(began)
		HealthDuration.Observe(took.Seconds())

		// 1s is pretty random, but a *local* scrape taking that long isn't good.
		if took > time.Second {
			log.Warningf("Local health request to %q took more than 1s: %s", endpoint, took)
		}
	}
}
// Metrics exported by the health plugin's self-check.
var (
	// HealthDuration is a histogram of how long, in seconds, each request to
	// the /health endpoint took. It uses plugin.SlimTimeBuckets as its bucket
	// layout.
	HealthDuration = promauto.NewHistogram(prometheus.HistogramOpts{
		Namespace: plugin.Namespace,
		Subsystem: "health",
		Name:      "request_duration_seconds",
		Help:      "Histogram of the time (in seconds) each request took.",
		Buckets:   plugin.SlimTimeBuckets,
	})

	// HealthFailures counts how many times the health request failed.
	HealthFailures = promauto.NewCounter(prometheus.CounterOpts{
		Namespace: plugin.Namespace,
		Subsystem: "health",
		Name:      "request_failures_total",
		Help:      "The number of times the health check failed.",
	})
)